Diffstat (limited to 'llvm/lib')
17 files changed, 195 insertions, 58 deletions
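This patch adds an IsMasked parameter to TargetTransformInfo::getInterleavedMemoryOpCost (and to the AArch64, ARM, Hexagon, PowerPC, SystemZ and X86 overrides), a TTI::enableMaskedInterleavedAccessVectorization hook with an -enable-masked-interleaved-mem-accesses override flag, and teaches the loop vectorizer to form and widen interleave groups that sit in predicated blocks by emitting masked loads and stores. The key building block is the new llvm::createReplicatedMask helper: the block mask has one lane per vectorized iteration, so it is replicated once per group member before being fed to the masked memory intrinsic. As a small illustration derived from the helper's loop (not from a test in this patch), createReplicatedMask(Builder, /*ReplicationFactor=*/3, /*VF=*/4) returns the constant

  <12 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>

which, used as a shufflevector mask, repeats each of the 4 mask lanes 3 times. A fuller IR sketch of the resulting masked interleaved load appears after the diff.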
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 4ad48e351a4..867403d0ef1 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -268,6 +268,10 @@ bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } +bool TargetTransformInfo::enableMaskedInterleavedAccessVectorization() const { + return TTIImpl->enableMaskedInterleavedAccessVectorization(); +} + bool TargetTransformInfo::isFPVectorizationPotentiallyUnsafe() const { return TTIImpl->isFPVectorizationPotentiallyUnsafe(); } @@ -515,9 +519,9 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, int TargetTransformInfo::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace) const { - int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + unsigned Alignment, unsigned AddressSpace, bool IsMasked) const { + int Cost = TTIImpl->getInterleavedMemoryOpCost( + Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 272c665ace1..e14449b8838 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -502,6 +502,16 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) { return Inst; } +Constant *llvm::createReplicatedMask(IRBuilder<> &Builder, + unsigned ReplicationFactor, unsigned VF) { + SmallVector<Constant *, 16> MaskVec; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < ReplicationFactor; j++) + MaskVec.push_back(Builder.getInt32(i)); + + return ConstantVector::get(MaskVec); +} + Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF, unsigned NumVecs) { SmallVector<Constant *, 16> Mask; @@ -672,7 +682,8 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // this group because it and (2) are dependent. However, (1) can be grouped // with other accesses that may precede it in program order. Note that a // bottom-up order does not imply that WAW dependences should not be checked. -void InterleavedAccessInfo::analyzeInterleaving() { +void InterleavedAccessInfo::analyzeInterleaving( + bool EnablePredicatedInterleavedMemAccesses) { LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n"); const ValueToValueMap &Strides = LAI->getSymbolicStrides(); @@ -712,9 +723,8 @@ void InterleavedAccessInfo::analyzeInterleaving() { // create a group for B, we continue with the bottom-up algorithm to ensure // we don't break any of B's dependences. InterleaveGroup *Group = nullptr; - // TODO: Ignore B if it is in a predicated block. This restriction can be - // relaxed in the future once we handle masked interleaved groups. - if (isStrided(DesB.Stride) && !isPredicated(B->getParent())) { + if (isStrided(DesB.Stride) && + (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) { Group = getInterleaveGroup(B); if (!Group) { LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B @@ -808,11 +818,12 @@ void InterleavedAccessInfo::analyzeInterleaving() { if (DistanceToB % static_cast<int64_t>(DesB.Size)) continue; - // Ignore A if either A or B is in a predicated block. 
Although we - // currently prevent group formation for predicated accesses, we may be - // able to relax this limitation in the future once we handle more - // complicated blocks. - if (isPredicated(A->getParent()) || isPredicated(B->getParent())) + // All members of a predicated interleave-group must have the same predicate, + // and currently must reside in the same BB. + BasicBlock *BlockA = A->getParent(); + BasicBlock *BlockB = B->getParent(); + if ((isPredicated(BlockA) || isPredicated(BlockB)) && + (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB)) continue; // The index of A is the index of B plus A's distance to B in multiples diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 96e751e8697..a16de89cf10 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -659,11 +659,12 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); - if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { + if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -676,7 +677,7 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index c056a7d2428..b3893d32850 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -146,7 +146,7 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked = false); bool shouldConsiderAddressTypePromotion(const Instruction &I, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 1b0d162f726..90e0cd96682 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -542,14 +542,16 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. 
bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; - if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { + if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits && + !IsMasked) { unsigned NumElts = VecTy->getVectorNumElements(); auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); @@ -562,7 +564,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 7d14bd7c256..84e3055c6bc 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -169,7 +169,7 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked); void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index 4d0e7dc52e8..79b269bccfe 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -206,10 +206,10 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace) { - if (Indices.size() != Factor) + unsigned Alignment, unsigned AddressSpace, bool IsMasked) { + if (Indices.size() != Factor || IsMasked) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr); } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 2c03cd268ff..901a91692e8 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -123,7 +123,7 @@ public: bool VariableMask, unsigned Alignment); unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I); unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index b0da9b5a6d7..f67bacc87ec 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -473,7 +473,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); + assert(isa<VectorType>(VecTy) && "Expect a vector type for interleaved memory op"); diff 
--git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 2ee2b3eb808..252d46e7a2a 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -90,7 +90,8 @@ public: unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, + bool IsMasked = false); /// @} }; diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 6f553d5bed3..1eaeb9699bf 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -909,7 +909,11 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); assert(isa<VectorType>(VecTy) && "Expect a vector type for interleaved memory op"); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index bfa942357c5..92b2b9bdcb8 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -92,7 +92,7 @@ public: unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace); + unsigned AddressSpace, bool IsMasked = false); /// @} }; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index d3a75123935..82e4dfe25b7 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2723,7 +2723,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). @@ -2832,7 +2837,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { + + if (IsMasked) + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace, IsMasked); // VecTy for interleave memop is <VF*Factor x Elt>. 
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -2950,7 +2960,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, - unsigned AddressSpace) { + unsigned AddressSpace, + bool IsMasked) { auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || @@ -2962,11 +2973,11 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, }; if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); if (ST->hasAVX2()) return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + Alignment, AddressSpace, IsMasked); } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 3df89903882..2bd778a4211 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -101,13 +101,16 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace); + unsigned Alignment, unsigned AddressSpace, + bool IsMasked = false); int getIntImmCost(int64_t); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7ebe8d102b7..e93cfb34156 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -172,6 +172,10 @@ static cl::opt<bool> EnableInterleavedMemAccesses( "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop")); +static cl::opt<bool> EnableMaskedInterleavedMemAccesses( + "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, + cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); + /// We don't interleave loops with a known constant trip count below this /// number. static const unsigned TinyTripCountInterleaveThreshold = 128; @@ -408,8 +412,10 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to. - void vectorizeInterleaveGroup(Instruction *Instr); + /// Try to vectorize the interleaved access group that \p Instr belongs to, + /// optionally masking the vector operations if \p BlockInMask is non-null. 
+ void vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask = nullptr); /// Vectorize Load and Store instructions, optionally masking the vector /// operations if \p BlockInMask is non-null. @@ -1112,6 +1118,11 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + /// Returns true if \p I is a memory instruction in an interleaved-group + /// of memory accesses that can be vectorized with wide vector loads/stores + /// and shuffles. + bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1); + /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { return InterleaveInfo.isInterleaved(Instr); @@ -1946,7 +1957,8 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { // %interleaved.vec = shuffle %R_G.vec, %B_U.vec, // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B -void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, + VectorParts *BlockInMask) { const InterleaveGroup *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -1968,6 +1980,15 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { SmallVector<Value *, 2> NewPtrs; unsigned Index = Group->getIndex(Instr); + VectorParts Mask; + bool IsMaskRequired = BlockInMask; + if (IsMaskRequired) { + Mask = *BlockInMask; + // TODO: extend the masked interleaved-group support to reversed access. + assert(!Group->isReverse() && "Reversed masked interleave-group " + "not supported."); + } + // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the @@ -2011,8 +2032,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { // For each unroll part, create a wide load for the group. 
SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { - auto *NewLoad = Builder.CreateAlignedLoad( - NewPtrs[Part], Group->getAlignment(), "wide.vec"); + Instruction *NewLoad; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + ShuffledMask, UndefVec, + "wide.masked.vec"); + } + else + NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part], + Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); } @@ -2079,8 +2111,18 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, "interleaved.vec"); - Instruction *NewStoreInstr = - Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlignment()); + Instruction *NewStoreInstr; + if (IsMaskRequired) { + auto *Undefs = UndefValue::get(Mask[Part]->getType()); + auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); + Value *ShuffledMask = Builder.CreateShuffleVector( + Mask[Part], Undefs, RepMask, "interleaved.mask"); + NewStoreInstr = Builder.CreateMaskedStore( + IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + } + else + NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], + Group->getAlignment()); Group->addMetadata(NewStoreInstr); } @@ -4253,6 +4295,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return false; } +static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { + if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)) + return TTI.enableMaskedInterleavedAccessVectorization(); + + // If an override option has been passed in for interleaved accesses, use it. + return EnableMaskedInterleavedMemAccesses; +} + +bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, + unsigned VF) { + assert(isAccessInterleaved(I) && "Expecting interleaved access."); + assert(getWideningDecision(I, VF) == CM_Unknown && + "Decision should not be set yet."); + + if (!Legal->blockNeedsPredication(I->getParent()) || + !Legal->isMaskRequired(I)) + return true; + + if (!useMaskedInterleavedAccesses(TTI)) + return false; + + auto *Ty = getMemInstValueType(I); + return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) + : TTI.isLegalMaskedStore(Ty); +} + bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, unsigned VF) { // Get and ensure we have a valid memory instruction. @@ -5371,13 +5439,17 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, - Group->getFactor(), Indices, - Group->getAlignment(), AS); - - if (Group->isReverse()) + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, Group->getFactor(), Indices, + Group->getAlignment(), AS, Legal->isMaskRequired(I)); + + if (Group->isReverse()) { + // TODO: Add support for reversed masked interleaved access. 
+ assert(!Legal->isMaskRequired(I) && + "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + } return Cost; } @@ -5479,7 +5551,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { continue; NumAccesses = Group->getNumMembers(); - InterleaveCost = getInterleaveGroupCost(&I, VF); + if (interleavedAccessCanBeWidened(&I, VF)) + InterleaveCost = getInterleaveGroupCost(&I, VF); } unsigned GatherScatterCost = @@ -6152,7 +6225,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { } VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range) { + VFRange &Range, + VPlanPtr &Plan) { const InterleaveGroup *IG = CM.getInterleavedAccessGroup(I); if (!IG) return nullptr; @@ -6174,7 +6248,11 @@ VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, assert(I == IG->getInsertPos() && "Generating a recipe for an adjunct member of an interleave group"); - return new VPInterleaveRecipe(IG); + VPValue *Mask = nullptr; + if (Legal->isMaskRequired(I)) + Mask = createBlockInMask(I->getParent(), Plan); + + return new VPInterleaveRecipe(IG, Mask); } VPWidenMemoryInstructionRecipe * @@ -6442,7 +6520,7 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPRecipeBase *Recipe = nullptr; // Check if Instr should belong to an interleave memory recipe, or already // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range))) { + if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { VPBB->appendRecipe(Recipe); return true; } @@ -6669,6 +6747,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); + if (User) { + O << ", "; + User->getOperand(0)->printAsOperand(O); + } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) @@ -6731,7 +6813,15 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + if (!User) + return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); + + // Last (and currently only) operand is a mask. + InnerLoopVectorizer::VectorParts MaskValues(State.UF); + VPValue *Mask = User->getOperand(User->getNumOperands() - 1); + for (unsigned Part = 0; Part < State.UF; ++Part) + MaskValues[Part] = State.get(Mask, Part); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7030,7 +7120,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Analyze interleaved memory accesses. if (UseInterleaved) { - IAI.analyzeInterleaving(); + IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); } // Use the cost model. diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index f43a8bb123b..15d38ac9c84 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -69,7 +69,8 @@ public: /// \return value is <true, nullptr>, as it is handled by another recipe. /// \p Range.End may be decreased to ensure same decision from \p Range.Start /// to \p Range.End. 
-  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
+  VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range,
+                                            VPlanPtr &Plan);

   /// Check if \I is a memory instruction to be widened for \p Range.Start and
   /// potentially masked. Such instructions are handled by a recipe that takes
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c3123b41600..81b1986c97d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -769,10 +769,14 @@ public:
 class VPInterleaveRecipe : public VPRecipeBase {
 private:
   const InterleaveGroup *IG;
+  std::unique_ptr<VPUser> User;

 public:
-  VPInterleaveRecipe(const InterleaveGroup *IG)
-      : VPRecipeBase(VPInterleaveSC), IG(IG) {}
+  VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Mask)
+      : VPRecipeBase(VPInterleaveSC), IG(IG) {
+    if (Mask) // Create a VPInstruction to register as a user of the mask.
+      User.reset(new VPUser({Mask}));
+  }
   ~VPInterleaveRecipe() override = default;

   /// Method to support type inquiry through isa, cast, and dyn_cast.
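To make the new code path in vectorizeInterleaveGroup concrete, here is a hedged IR sketch of what it would emit for a predicated, factor-2 interleaved load of i32 at VF=4 (register names such as %block.mask and %ptr, and the alignment, are illustrative; only the shape of the shuffles and the masked-load call follow from the code above):

  ; Replicate each lane of the <4 x i1> block mask once per group member,
  ; using the mask created by createReplicatedMask(Builder, 2, 4).
  %interleaved.mask = shufflevector <4 x i1> %block.mask, <4 x i1> undef,
                        <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>

  ; One wide masked load replaces the group's conditional scalar loads.
  %wide.masked.vec = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(
                         <8 x i32>* %ptr, i32 4, <8 x i1> %interleaved.mask, <8 x i32> undef)

  ; The existing de-interleaving shuffles then extract each member, as before.
  %strided.vec0 = shufflevector <8 x i32> %wide.masked.vec, <8 x i32> undef,
                        <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec1 = shufflevector <8 x i32> %wide.masked.vec, <8 x i32> undef,
                        <4 x i32> <i32 1, i32 3, i32 5, i32 7>

Stores follow the same pattern: the already-interleaved value is written with @llvm.masked.store using the same replicated mask. Targets whose overrides above bail out on IsMasked fall back to BaseT::getInterleavedMemoryOpCost, so the cost model can still decide against forming a masked group where it would be unprofitable.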