author      Dorit Nuzman <dorit.nuzman@intel.com>    2018-10-31 09:57:56 +0000
committer   Dorit Nuzman <dorit.nuzman@intel.com>    2018-10-31 09:57:56 +0000
commit      34da6dd696439e195e7b650d97a95913101a88d9 (patch)
tree        a718d6a89ceb39ada3675f96f8de45c051e8ce7f /llvm/lib
parent      889356eb719ded45c708514fb03777f705eb5934 (diff)
[LV] Support vectorization of interleave-groups that require an epilogue under
optsize using masked wide loads

Under Opt for Size, the vectorizer does not vectorize interleave-groups that
have gaps at the end of the group (such as a loop that reads only the even
elements: a[2*i]), because that implies we would require a scalar epilogue,
which is not allowed under Opt for Size. This patch extends the support for
masked interleave-groups (introduced by D53011 for conditional accesses) to
also cover the case of gaps in a group of loads; targets that enable the
masked-interleave-group feature no longer have to invalidate interleave-groups
of loads with gaps: they can now use masked wide loads and shuffles (if that is
what the cost model selects).

Reviewers: Ayal, hsaito, dcaballe, fhahn

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D53668

llvm-svn: 345705
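As a point of reference, here is a minimal standalone sketch (illustration only, not part of the patch; the function and buffer are made up) of the kind of gapped access the message above describes, and why it previously forced a scalar epilogue:

// Illustration only: an interleave group of factor 2 with a single member.
// Only the even elements of 'a' are read; the odd slots are the "gap".
#include <cstddef>
#include <cstdio>

int sum_even(const int *a, size_t n) {
  int s = 0;
  for (size_t i = 0; i < n; ++i)
    s += a[2 * i];                  // a[1], a[3], ... are never touched
  return s;
}

int main() {
  int a[7] = {1, 0, 2, 0, 3, 0, 4}; // holds a[2*i] for i = 0..3, nothing after
  printf("%d\n", sum_even(a, 4));   // prints 10
}

A vectorized version that loads the full 2*VF contiguous elements per iteration would read one element past the last a[2*i] the scalar loop touches (a[7] here); before this patch that required peeling a scalar iteration, which optsize forbids, whereas now the trailing gap lanes can be masked off in a masked wide load.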
Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/TargetTransformInfo.cpp               |   9
-rw-r--r--  llvm/lib/Analysis/VectorUtils.cpp                       |  24
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp  |   9
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h    |   4
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp          |   8
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h            |   4
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp  |   8
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h    |   3
-rw-r--r--  llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp      |   8
-rw-r--r--  llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h        |   3
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp  |   8
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h    |   4
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp          |  28
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h            |   9
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp         | 114
15 files changed, 173 insertions, 70 deletions
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 867403d0ef1..6e4eb8ff0cd 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -519,9 +519,12 @@ int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
int TargetTransformInfo::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace, bool IsMasked) const {
- int Cost = TTIImpl->getInterleavedMemoryOpCost(
- Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, IsMasked);
+ unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+ bool UseMaskForGaps) const {
+ int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+ Alignment, AddressSpace,
+ UseMaskForCond,
+ UseMaskForGaps);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 8b6702c8544..38dca50e82a 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -504,6 +504,25 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
return Inst;
}
+Constant *llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
+ const InterleaveGroup &Group) {
+ // All 1's means mask is not needed.
+ if (Group.getNumMembers() == Group.getFactor())
+ return nullptr;
+
+ // TODO: support reversed access.
+ assert(!Group.isReverse() && "Reversed group not supported.");
+
+ SmallVector<Constant *, 16> Mask;
+ for (unsigned i = 0; i < VF; i++)
+ for (unsigned j = 0; j < Group.getFactor(); ++j) {
+ unsigned HasMember = Group.getMember(j) ? 1 : 0;
+ Mask.push_back(Builder.getInt1(HasMember));
+ }
+
+ return ConstantVector::get(Mask);
+}
+
Constant *llvm::createReplicatedMask(IRBuilder<> &Builder,
unsigned ReplicationFactor, unsigned VF) {
SmallVector<Constant *, 16> MaskVec;
@@ -935,9 +954,10 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
}
for (auto *Ptr : DelSet) {
LLVM_DEBUG(
- dbgs()
+ dbgs()
<< "LV: Invalidate candidate interleaved group due to gaps that "
- "require a scalar epilogue.\n");
+ "require a scalar epilogue (not allowed under optsize) and cannot "
+ "be masked (not enabled). \n");
releaseGroup(Ptr);
}
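For illustration, a standalone model (no LLVM APIs; VF, Factor, and the values are made up) of what the mask produced by the new createBitMaskForGaps helper expresses, and how the masked wide load plus strided shuffle in the vectorizer's load path consumes it:

// Standalone model (illustration only) of the new codegen path for a load
// group with a gap: a masked wide load of VF * Factor lanes followed by a
// strided extraction of the lanes of the single present member.
#include <array>
#include <cassert>

int main() {
  constexpr unsigned VF = 4, Factor = 2;
  // Backing memory covered by one wide load: VF * Factor contiguous ints.
  std::array<int, VF * Factor> Memory = {10, 11, 12, 13, 14, 15, 16, 17};

  // Mask for gaps, laid out as in createBitMaskForGaps: for each of the VF
  // lanes, one bit per group member; here only member 0 is present.
  std::array<bool, VF * Factor> MaskForGaps{};
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      MaskForGaps[i * Factor + j] = (j == 0);

  // "wide.masked.vec": masked-off lanes are never read (left at 0 here,
  // standing in for the undef pass-through value of a masked load).
  std::array<int, VF * Factor> WideMaskedVec{};
  for (unsigned Lane = 0; Lane < VF * Factor; ++Lane)
    if (MaskForGaps[Lane])
      WideMaskedVec[Lane] = Memory[Lane];

  // "strided.vec": shuffle out the lanes of member 0 (indices 0, 2, 4, 6).
  std::array<int, VF> StridedVec{};
  for (unsigned i = 0; i < VF; ++i)
    StridedVec[i] = WideMaskedVec[i * Factor];

  // Same values the scalar loop b[i] = a[2*i] would have produced.
  for (unsigned i = 0; i < VF; ++i)
    assert(StridedVec[i] == Memory[2 * i]);
  return 0;
}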
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 77c83970f68..a256cb7c921 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -660,11 +660,13 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
- bool IsMasked) {
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
- if (!IsMasked && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+ if (!UseMaskForCond && !UseMaskForGaps &&
+ Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
@@ -677,7 +679,8 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
}
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index b3893d32850..08c1a892422 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -146,7 +146,9 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace, bool IsMasked = false);
+ unsigned AddressSpace,
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
bool
shouldConsiderAddressTypePromotion(const Instruction &I,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index a07c1e83a3f..f72bb8632eb 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -564,7 +564,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
- bool IsMasked) {
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
@@ -572,7 +573,7 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
- !IsMasked) {
+ !UseMaskForCond && !UseMaskForGaps) {
unsigned NumElts = VecTy->getVectorNumElements();
auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
@@ -585,7 +586,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 84e3055c6bc..2dd143d48a1 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -169,7 +169,9 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace, bool IsMasked);
+ unsigned AddressSpace,
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 79b269bccfe..4c671460c90 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -206,10 +206,12 @@ unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace, bool IsMasked) {
- if (Indices.size() != Factor || IsMasked)
+ unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
+ bool UseMaskForGaps) {
+ if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
return getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, nullptr);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 901a91692e8..5c6f85584ec 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -123,7 +123,8 @@ public:
bool VariableMask, unsigned Alignment);
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace, bool IsMasked);
+ unsigned AddressSpace, bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I);
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f67bacc87ec..bc9bcab83a0 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -474,10 +474,12 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
- bool IsMasked) {
- if (IsMasked)
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
+ if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 252d46e7a2a..9221a910288 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -91,7 +91,8 @@ public:
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
- bool IsMasked = false);
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
/// @}
};
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index caa3f597445..94db56e3738 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -969,10 +969,12 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
- bool IsMasked) {
- if (IsMasked)
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
+ if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index 347a8a632f0..406f075c8a6 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -93,7 +93,9 @@ public:
unsigned Factor,
ArrayRef<unsigned> Indices,
unsigned Alignment,
- unsigned AddressSpace, bool IsMasked = false);
+ unsigned AddressSpace,
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
/// @}
};
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 8d8bc0b35cb..ebb8aca5fb1 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2784,11 +2784,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
- bool IsMasked) {
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
- if (IsMasked)
+ if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
// We currently Support only fully-interleaved groups, with no gaps.
// TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2898,11 +2900,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
- bool IsMasked) {
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
- if (IsMasked)
+ if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -3021,7 +3025,8 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace,
- bool IsMasked) {
+ bool UseMaskForCond,
+ bool UseMaskForGaps) {
auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
@@ -3033,11 +3038,14 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
};
if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
if (ST->hasAVX2())
return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace, IsMasked);
+ Alignment, AddressSpace,
+ UseMaskForCond, UseMaskForGaps);
}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 2bd778a4211..1637592c81f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -102,15 +102,18 @@ public:
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
- bool IsMasked = false);
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
- bool IsMasked = false);
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
unsigned Alignment, unsigned AddressSpace,
- bool IsMasked = false);
+ bool UseMaskForCond = false,
+ bool UseMaskForGaps = false);
int getIntImmCost(int64_t);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ffa6b242e00..23d4a6b2166 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,8 @@ static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
+/// An interleave-group may need masking if it resides in a block that needs
+/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
"enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
@@ -1134,11 +1136,15 @@ public:
}
/// Returns true if an interleaved group requires a scalar iteration
- /// to handle accesses with gaps.
+ /// to handle accesses with gaps, and there is nothing preventing us from
+ /// creating a scalar epilogue.
bool requiresScalarEpilogue() const {
- return InterleaveInfo.requiresScalarEpilogue();
+ return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue();
}
+ /// Returns true if a scalar epilogue is not allowed due to optsize.
+ bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; }
+
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const { return FoldTailByMasking; }
@@ -1229,6 +1235,15 @@ private:
/// vectorization as a predicated block.
SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+ /// Records whether it is allowed to have the original scalar loop execute at
+ /// least once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+ /// iterations when the trip count is unknown or doesn't divide by the VF,
+ /// or as a peel-loop to handle gaps in interleave-groups.
+ /// Under optsize and when the trip count is very small we don't allow any
+ /// iterations to execute in the scalar loop.
+ bool IsScalarEpilogueAllowed = true;
+
/// All blocks of loop are to be masked to fold tail of scalar iterations.
bool FoldTailByMasking = false;
@@ -1938,6 +1953,17 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
"reverse");
}
+// Return whether we allow using masked interleave-groups (for dealing with
+// strided loads/stores that reside in predicated blocks, or for dealing
+// with gaps).
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+ return EnableMaskedInterleavedMemAccesses;
+
+ return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
@@ -1990,12 +2016,12 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
unsigned Index = Group->getIndex(Instr);
VectorParts Mask;
- bool IsMaskRequired = BlockInMask;
- if (IsMaskRequired) {
+ bool IsMaskForCondRequired = BlockInMask;
+ if (IsMaskForCondRequired) {
Mask = *BlockInMask;
// TODO: extend the masked interleaved-group support to reversed access.
assert(!Group->isReverse() && "Reversed masked interleave-group "
- "not supported.");
+ "not supported.");
}
// If the group is reverse, adjust the index to refer to the last vector lane
@@ -2036,20 +2062,35 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
setDebugLocFromInst(Builder, Instr);
Value *UndefVec = UndefValue::get(VecTy);
+ Value *MaskForGaps = nullptr;
+ if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
+ MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
+ assert(MaskForGaps && "Mask for Gaps is required but it is null");
+ }
+
// Vectorize the interleaved load group.
if (isa<LoadInst>(Instr)) {
// For each unroll part, create a wide load for the group.
SmallVector<Value *, 2> NewLoads;
for (unsigned Part = 0; Part < UF; Part++) {
Instruction *NewLoad;
- if (IsMaskRequired) {
- auto *Undefs = UndefValue::get(Mask[Part]->getType());
- auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
- Value *ShuffledMask = Builder.CreateShuffleVector(
- Mask[Part], Undefs, RepMask, "interleaved.mask");
- NewLoad = Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
- ShuffledMask, UndefVec,
- "wide.masked.vec");
+ if (IsMaskForCondRequired || MaskForGaps) {
+ assert(useMaskedInterleavedAccesses(*TTI) &&
+ "masked interleaved groups are not allowed.");
+ Value *GroupMask = MaskForGaps;
+ if (IsMaskForCondRequired) {
+ auto *Undefs = UndefValue::get(Mask[Part]->getType());
+ auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
+ Mask[Part], Undefs, RepMask, "interleaved.mask");
+ GroupMask = MaskForGaps
+ ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+ MaskForGaps)
+ : ShuffledMask;
+ }
+ NewLoad =
+ Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(),
+ GroupMask, UndefVec, "wide.masked.vec");
}
else
NewLoad = Builder.CreateAlignedLoad(NewPtrs[Part],
@@ -2121,7 +2162,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
"interleaved.vec");
Instruction *NewStoreInstr;
- if (IsMaskRequired) {
+ if (IsMaskForCondRequired) {
auto *Undefs = UndefValue::get(Mask[Part]->getType());
auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
@@ -4333,29 +4374,32 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
return false;
}
-static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
- if (!(EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0))
- return TTI.enableMaskedInterleavedAccessVectorization();
-
- // If an override option has been passed in for interleaved accesses, use it.
- return EnableMaskedInterleavedMemAccesses;
-}
-
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
unsigned VF) {
assert(isAccessInterleaved(I) && "Expecting interleaved access.");
assert(getWideningDecision(I, VF) == CM_Unknown &&
"Decision should not be set yet.");
-
- if (!Legal->blockNeedsPredication(I->getParent()) ||
- !Legal->isMaskRequired(I))
+ auto *Group = getInterleavedAccessGroup(I);
+ assert(Group && "Must have a group.");
+
+ // Check if masking is required.
+ // A Group may need masking for one of two reasons: it resides in a block that
+ // needs predication, or it was decided to use masking to deal with gaps.
+ bool PredicatedAccessRequiresMasking =
+ Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
+ bool AccessWithGapsRequiresMasking =
+ Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
+ if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
return true;
- if (!useMaskedInterleavedAccesses(TTI))
- return false;
+ // If masked interleaving is required, we expect that the user/target had
+ // enabled it, because otherwise it either wouldn't have been created or
+ // it should have been invalidated by the CostModel.
+ assert(useMaskedInterleavedAccesses(TTI) &&
+ "Masked interleave-groups for predicated accesses are not enabled.");
auto *Ty = getMemInstValueType(I);
- return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
+ return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty)
: TTI.isLegalMaskedStore(Ty);
}
@@ -4606,9 +4650,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
// Record that scalar epilogue is not allowed.
LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ IsScalarEpilogueAllowed = !OptForSize;
+
// We don't create an epilogue when optimizing for size.
- // Invalidate interleave groups that require an epilogue.
- InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+ // Invalidate interleave groups that require an epilogue if we can't mask
+ // the interleave-group.
+ if (!useMaskedInterleavedAccesses(TTI))
+ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC);
@@ -5495,13 +5543,15 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
}
// Calculate the cost of the whole interleaved group.
+ bool UseMaskForGaps =
+ Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed;
unsigned Cost = TTI.getInterleavedMemoryOpCost(
I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
- Group->getAlignment(), AS, Legal->isMaskRequired(I));
+ Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
- assert(!Legal->isMaskRequired(I) &&
+ assert(!Legal->isMaskRequired(I) &&
"Reverse masked interleaved access not supported.");
Cost += Group->getNumMembers() *
TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
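And a similarly hedged standalone sketch (values made up, no LLVM APIs) of how the final group mask is assembled in the new load path of vectorizeInterleaveGroup above when both a block predicate and gaps are present: each per-iteration condition bit is replicated across the interleave factor (the effect of createReplicatedMask plus the shufflevector) and the result is AND'ed with the mask for gaps:

// Standalone model (illustration only) of combining the block-predicate mask
// with the gap mask before the masked wide load.
#include <cstdio>

int main() {
  constexpr unsigned VF = 4, Factor = 2;
  // Per-iteration condition mask (one bit per lane of the original loop).
  bool CondMask[VF] = {true, false, true, true};
  // Gap mask: only group member 0 exists.
  bool MaskForGaps[VF * Factor];
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      MaskForGaps[i * Factor + j] = (j == 0);

  // "interleaved.mask": each condition bit replicated Factor times.
  bool ShuffledMask[VF * Factor];
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      ShuffledMask[i * Factor + j] = CondMask[i];

  // Final mask handed to the masked wide load: AND of the two.
  printf("group mask:");
  for (unsigned Lane = 0; Lane < VF * Factor; ++Lane)
    printf(" %d", ShuffledMask[Lane] && MaskForGaps[Lane]);
  printf("\n"); // prints: 1 0 0 0 1 0 1 0
}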