author     Dorit Nuzman <dorit.nuzman@intel.com>    2018-10-31 09:57:56 +0000
committer  Dorit Nuzman <dorit.nuzman@intel.com>    2018-10-31 09:57:56 +0000
commit     34da6dd696439e195e7b650d97a95913101a88d9 (patch)
tree       a718d6a89ceb39ada3675f96f8de45c051e8ce7f /llvm/lib
parent     889356eb719ded45c708514fb03777f705eb5934 (diff)
[LV] Support vectorization of interleave-groups that require an epilogue under optsize, using masked wide loads
Under Opt for Size, the vectorizer does not vectorize interleave-groups that
have gaps at the end of the group (such as a loop that reads only the even
elements: a[2*i]), because that implies we would need a scalar epilogue,
which is not allowed under Opt for Size. This patch extends the
masked-interleave-group support (introduced by D53011 for conditional
accesses) to also cover the case of gaps in a group of loads: targets that
enable the masked-interleave-group feature no longer have to invalidate
interleave-groups of loads with gaps; they can now use masked wide loads and
shuffles (if that is what the cost model selects).
Reviewers: Ayal, hsaito, dcaballe, fhahn
Reviewed By: Ayal
Differential Revision: https://reviews.llvm.org/D53668
llvm-svn: 345705
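Note (editorial illustration, not part of the commit): the kind of loop this patch targets is an interleave-group of loads with a gap, e.g. the a[2*i] access mentioned above. Under -Os/-Oz such a loop previously stayed scalar, because the last wide load would touch an element the scalar loop never reads, which requires a scalar epilogue; with masked interleave-groups that trailing lane can be masked off instead. A minimal C++ sketch of the pattern (function and names are hypothetical):

// Hypothetical example (not from the patch): an interleave-group of loads with
// factor 2 and a gap -- only a[2*i] is read, a[2*i+1] never is. Widening the
// loads into one <2*VF>-element load would access an element past the last one
// the scalar loop uses, which previously forced a scalar epilogue (disallowed
// under optsize). With masked interleave-groups that lane is masked off.
int sum_even(const int *a, int n) {
  int s = 0;
  for (int i = 0; i < n; ++i)
    s += a[2 * i]; // stride-2 group: member 0 present, member 1 is a gap
  return s;
}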
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/TargetTransformInfo.cpp                |   9
-rw-r--r--  llvm/lib/Analysis/VectorUtils.cpp                        |  24
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp   |   9
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h     |   4
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp           |   8
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h             |   4
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp   |   8
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h     |   3
-rw-r--r--  llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp       |   8
-rw-r--r--  llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h         |   3
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp   |   8
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h     |   4
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp           |  28
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h             |   9
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp          | 114
15 files changed, 173 insertions, 70 deletions
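Note (editorial illustration, not part of the commit): the gap mask built by the new createBitMaskForGaps helper in VectorUtils.cpp (see the diff below) has one bit per lane of the wide vector, repeating the group's member/gap pattern VF times. A standalone C++ sketch of the same indexing, using plain bools instead of LLVM Constant vectors (names are hypothetical):

#include <vector>

// Standalone sketch (not LLVM code) of the mask createBitMaskForGaps builds:
// for each of the VF groups, emit Factor bits, 1 where the group has a member
// at that position and 0 where there is a gap.
// E.g. VF=4, Factor=2, only member 0 present -> {1,0,1,0,1,0,1,0}.
std::vector<bool> bitMaskForGaps(unsigned VF, unsigned Factor,
                                 const std::vector<bool> &HasMember) {
  std::vector<bool> Mask;
  Mask.reserve(VF * Factor);
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      Mask.push_back(HasMember[j]); // gap positions get a 0 (masked off)
  return Mask;
}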
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 867403d0ef1..6e4eb8ff0cd 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -519,9 +519,12 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 int TargetTransformInfo::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
-  int Cost = TTIImpl->getInterleavedMemoryOpCost(
-      Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
+    unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+    bool UseMaskForGaps) const {
+  int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+                                                 Alignment, AddressSpace,
+                                                 UseMaskForCond,
+                                                 UseMaskForGaps);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 8b6702c8544..38dca50e82a 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -504,6 +504,25 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
   return Inst;
 }
 
+Constant *llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
+                                     const InterleaveGroup &Group) {
+  // All 1's means mask is not needed.
+  if (Group.getNumMembers() == Group.getFactor())
+    return nullptr;
+
+  // TODO: support reversed access.
+  assert(!Group.isReverse() && "Reversed group not supported.");
+
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < VF; i++)
+    for (unsigned j = 0; j < Group.getFactor(); ++j) {
+      unsigned HasMember = Group.getMember(j) ? 1 : 0;
+      Mask.push_back(Builder.getInt1(HasMember));
+    }
+
+  return ConstantVector::get(Mask);
+}
+
 Constant *llvm::createReplicatedMask(IRBuilder<> &Builder,
                                      unsigned ReplicationFactor, unsigned VF) {
   SmallVector<Constant *, 16> MaskVec;
@@ -935,9 +954,10 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
   }
   for (auto *Ptr : DelSet) {
     LLVM_DEBUG(
-        dbgs()
+        dbgs()
         << "LV: Invalidate candidate interleaved group due to gaps that "
-           "require a scalar epilogue.\n");
+           "require a scalar epilogue (not allowed under optsize) and cannot "
+           "be masked (not enabled). \n");
     releaseGroup(Ptr);
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 77c83970f68..a256cb7c921 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -660,11 +660,13 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
                                                unsigned AddressSpace,
-                                               bool IsMasked) {
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
-  if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  if (!UseMaskForCond && !UseMaskForGaps &&
+      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -677,7 +679,8 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
 
 int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index b3893d32850..08c1a892422 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -146,7 +146,9 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked = false);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   bool shouldConsiderAddressTypePromotion(const Instruction &I,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index a07c1e83a3f..f72bb8632eb 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -564,7 +564,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace,
-                                           bool IsMasked) {
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
@@ -572,7 +573,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
 
   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
-      !IsMasked) {
+      !UseMaskForCond && !UseMaskForGaps) {
     unsigned NumElts = VecTy->getVectorNumElements();
     auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
@@ -585,7 +586,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
 
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 84e3055c6bc..2dd143d48a1 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -169,7 +169,9 @@ public:
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP);
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 79b269bccfe..4c671460c90 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -206,10 +206,12 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
 
 unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
       Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-      unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
-  if (Indices.size() != Factor || IsMasked)
+      unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+      bool UseMaskForGaps) {
+  if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
   return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 901a91692e8..5c6f85584ec 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,7 +123,8 @@ public:
             bool VariableMask, unsigned Alignment);
   unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
             unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
-            unsigned AddressSpace, bool IsMasked);
+            unsigned AddressSpace, bool UseMaskForCond = false,
+            bool UseMaskForGaps = false);
   unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
             const Instruction *I);
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f67bacc87ec..bc9bcab83a0 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -474,10 +474,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace,
-                                           bool IsMasked) {
-  if (IsMasked)
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
+  if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 252d46e7a2a..9221a910288 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -91,7 +91,8 @@ public:
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
                                  unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   /// @}
 };
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index caa3f597445..94db56e3738 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -969,10 +969,12 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
                                                unsigned AddressSpace,
-                                               bool IsMasked) {
-  if (IsMasked)
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
+  if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   assert(isa<VectorType>(VecTy) &&
          "Expect a vector type for interleaved memory op");
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 347a8a632f0..406f075c8a6 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -93,7 +93,9 @@ public:
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
-                                 unsigned AddressSpace, bool IsMasked = false);
+                                 unsigned AddressSpace,
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
 
   /// @}
 };
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8d8bc0b35cb..ebb8aca5fb1 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2784,11 +2784,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
                                                unsigned AddressSpace,
-                                               bool IsMasked) {
+                                               bool UseMaskForCond,
+                                               bool UseMaskForGaps) {
 
-  if (IsMasked)
+  if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2898,11 +2900,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                                  ArrayRef<unsigned> Indices,
                                                  unsigned Alignment,
                                                  unsigned AddressSpace,
-                                                 bool IsMasked) {
+                                                 bool UseMaskForCond,
+                                                 bool UseMaskForGaps) {
 
-  if (IsMasked)
+  if (UseMaskForCond || UseMaskForGaps)
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                             Alignment, AddressSpace, IsMasked);
+                                             Alignment, AddressSpace,
+                                             UseMaskForCond, UseMaskForGaps);
 
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -3021,7 +3025,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
                                            unsigned AddressSpace,
-                                           bool IsMasked) {
+                                           bool UseMaskForCond,
+                                           bool UseMaskForGaps) {
   auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
     Type *EltTy = VecTy->getVectorElementType();
     if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@@ -3033,11 +3038,14 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
   };
   if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
     return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
-                                            Alignment, AddressSpace, IsMasked);
+                                            Alignment, AddressSpace,
+                                            UseMaskForCond, UseMaskForGaps);
   if (ST->hasAVX2())
     return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
-                                          Alignment, AddressSpace, IsMasked);
+                                          Alignment, AddressSpace,
+                                          UseMaskForCond, UseMaskForGaps);
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace, IsMasked);
+                                           Alignment, AddressSpace,
+                                           UseMaskForCond, UseMaskForGaps);
 }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 2bd778a4211..1637592c81f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -102,15 +102,18 @@ public:
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
                                  unsigned Alignment, unsigned AddressSpace,
-                                 bool IsMasked = false);
+                                 bool UseMaskForCond = false,
+                                 bool UseMaskForGaps = false);
   int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
                                        unsigned Factor, ArrayRef<unsigned> Indices,
                                        unsigned Alignment, unsigned AddressSpace,
-                                       bool IsMasked = false);
+                                       bool UseMaskForCond = false,
+                                       bool UseMaskForGaps = false);
   int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
                                      unsigned Factor, ArrayRef<unsigned> Indices,
                                      unsigned Alignment, unsigned AddressSpace,
-                                     bool IsMasked = false);
+                                     bool UseMaskForCond = false,
+                                     bool UseMaskForGaps = false);
 
   int getIntImmCost(int64_t);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ffa6b242e00..23d4a6b2166 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,8 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
 
+/// An interleave-group may need masking if it resides in a block that needs
+/// predication, or in order to mask away gaps.
 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
@@ -1134,11 +1136,15 @@ public:
   }
 
   /// Returns true if an interleaved group requires a scalar iteration
-  /// to handle accesses with gaps.
+  /// to handle accesses with gaps, and there is nothing preventing us from
+  /// creating a scalar epilogue.
   bool requiresScalarEpilogue() const {
-    return InterleaveInfo.requiresScalarEpilogue();
+    return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
   }
 
+  /// Returns true if a scalar epilogue is not allowed due to optsize.
+  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
+
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const { return FoldTailByMasking; }
 
@@ -1229,6 +1235,15 @@ private:
   /// vectorization as a predicated block.
   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
 
+  /// Records whether it is allowed to have the original scalar loop execute at
+  /// least once. This may be needed as a fallback loop in case runtime
+  /// aliasing/dependence checks fail, or to handle the tail/remainder
+  /// iterations when the trip count is unknown or doesn't divide by the VF,
+  /// or as a peel-loop to handle gaps in interleave-groups.
+  /// Under optsize and when the trip count is very small we don't allow any
+  /// iterations to execute in the scalar loop.
+  bool IsScalarEpilogueAllowed = true;
+
   /// All blocks of loop are to be masked to fold tail of scalar iterations.
   bool FoldTailByMasking = false;
 
@@ -1938,6 +1953,17 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
                                      "reverse");
 }
 
+// Return whether we allow using masked interleave-groups (for dealing with
+// strided loads/stores that reside in predicated blocks, or for dealing
+// with gaps).
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+  // If an override option has been passed in for interleaved accesses, use it.
+  if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+    return EnableMaskedInterleavedMemAccesses;
+
+  return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
 // Try to vectorize the interleave group that \p Instr belongs to.
 //
 // E.g. Translate following interleaved load group (factor = 3):
@@ -1990,12 +2016,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
   unsigned Index = Group->getIndex(Instr);
 
   VectorParts Mask;
-  bool IsMaskRequired = BlockInMask;
-  if (IsMaskRequired) {
+  bool IsMaskForCondRequired = BlockInMask;
+  if (IsMaskForCondRequired) {
     Mask = *BlockInMask;
     // TODO: extend the masked interleaved-group support to reversed access.
     assert(!Group->isReverse() && "Reversed masked interleave-group "
-                                  "not supported.");
+                                  "not supported.");
   }
 
   // If the group is reverse, adjust the index to refer to the last vector lane
@@ -2036,20 +2062,35 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
   setDebugLocFromInst(Builder, Instr);
   Value *UndefVec = UndefValue::get(VecTy);
 
+  Value *MaskForGaps = nullptr;
+  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
+    MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
+    assert(MaskForGaps && "Mask for Gaps is required but it is null");
+  }
+
   // Vectorize the interleaved load group.
   if (isa<LoadInst>(Instr)) {
     // For each unroll part, create a wide load for the group.
     SmallVector<Value *, 2> NewLoads;
     for (unsigned Part = 0; Part < UF; Part++) {
       Instruction *NewLoad;
-      if (IsMaskRequired) {
-        auto *Undefs = UndefValue::get(Mask[Part]->getType());
-        auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
-        Value *ShuffledMask = Builder.CreateShuffleVector(
-            Mask[Part], Undefs, RepMask, "interleaved.mask");
-        NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
-                                           ShuffledMask, UndefVec,
-                                           "wide.masked.vec");
+      if (IsMaskForCondRequired || MaskForGaps) {
+        assert(useMaskedInterleavedAccesses(*TTI) &&
+               "masked interleaved groups are not allowed.");
+        Value *GroupMask = MaskForGaps;
+        if (IsMaskForCondRequired) {
+          auto *Undefs = UndefValue::get(Mask[Part]->getType());
+          auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+          Value *ShuffledMask = Builder.CreateShuffleVector(
+              Mask[Part], Undefs, RepMask, "interleaved.mask");
+          GroupMask = MaskForGaps
+                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+                                                MaskForGaps)
+                          : ShuffledMask;
+        }
+        NewLoad =
+            Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+                                     GroupMask, UndefVec, "wide.masked.vec");
       } else
         NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
@@ -2121,7 +2162,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
                                       "interleaved.vec");
 
     Instruction *NewStoreInstr;
-    if (IsMaskRequired) {
+    if (IsMaskForCondRequired) {
       auto *Undefs = UndefValue::get(Mask[Part]->getType());
      auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
      Value *ShuffledMask = Builder.CreateShuffleVector(
@@ -4333,29 +4374,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
   return false;
 }
 
-static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
-  if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0))
-    return TTI.enableMaskedInterleavedAccessVectorization();
-
-  // If an override option has been passed in for interleaved accesses, use it.
-  return EnableMaskedInterleavedMemAccesses;
-}
-
 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
                                                                unsigned VF) {
   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
   assert(getWideningDecision(I, VF) == CM_Unknown &&
          "Decision should not be set yet.");
-
-  if (!Legal->blockNeedsPredication(I->getParent()) ||
-      !Legal->isMaskRequired(I))
+  auto *Group = getInterleavedAccessGroup(I);
+  assert(Group && "Must have a group.");
+
+  // Check if masking is required.
+  // A Group may need masking for one of two reasons: it resides in a block that
+  // needs predication, or it was decided to use masking to deal with gaps.
+  bool PredicatedAccessRequiresMasking =
+      Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
+  bool AccessWithGapsRequiresMasking =
+      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
     return true;
 
-  if (!useMaskedInterleavedAccesses(TTI))
-    return false;
+  // If masked interleaving is required, we expect that the user/target had
+  // enabled it, because otherwise it either wouldn't have been created or
+  // it should have been invalidated by the CostModel.
+  assert(useMaskedInterleavedAccesses(TTI) &&
+         "Masked interleave-groups for predicated accesses are not enabled.");
 
   auto *Ty = getMemInstValueType(I);
   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
                           : TTI.isLegalMaskedStore(Ty);
 }
@@ -4606,9 +4650,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
   // Record that scalar epilogue is not allowed.
   LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
 
+  IsScalarEpilogueAllowed = !OptForSize;
+
   // We don't create an epilogue when optimizing for size.
-  // Invalidate interleave groups that require an epilogue.
-  InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+  // Invalidate interleave groups that require an epilogue if we can't mask
+  // the interleave-group.
+  if (!useMaskedInterleavedAccesses(TTI))
+    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
 
   unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
 
@@ -5495,13 +5543,15 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
   }
 
   // Calculate the cost of the whole interleaved group.
+  bool UseMaskForGaps =
+      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
   unsigned Cost = TTI.getInterleavedMemoryOpCost(
       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-      Group->getAlignment(), AS, Legal->isMaskRequired(I));
+      Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
 
   if (Group->isReverse()) {
     // TODO: Add support for reversed masked interleaved access.
-    assert(!Legal->isMaskRequired(I) &&
+    assert(!Legal->isMaskRequired(I) &&
            "Reverse masked interleaved access not supported.");
     Cost += Group->getNumMembers() *
             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
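Note (editorial illustration, not part of the commit): in vectorizeInterleaveGroup above, when both a block-predication mask and a gap mask exist, the per-iteration block mask is first replicated by the interleave factor (createReplicatedMask) and then ANDed with the gap mask to form the final mask of the wide masked load. A standalone C++ sketch of that combination, using plain bools instead of IR values (names are hypothetical):

#include <vector>

// Standalone sketch (not LLVM code): a wide-load lane is active only if its
// originating iteration is active (BlockMask, VF bits, replicated by Factor)
// AND the lane is not a gap (GapMask, VF*Factor bits).
std::vector<bool> groupMask(const std::vector<bool> &BlockMask, // size VF
                            const std::vector<bool> &GapMask,   // size VF*Factor
                            unsigned Factor) {
  std::vector<bool> Mask;
  Mask.reserve(GapMask.size());
  for (unsigned i = 0; i < BlockMask.size(); ++i)
    for (unsigned j = 0; j < Factor; ++j)
      Mask.push_back(BlockMask[i] && GapMask[i * Factor + j]);
  return Mask;
}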