-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp                225
-rw-r--r--  llvm/test/Transforms/LoopVectorize/conditional-assignment.ll     2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/hoist-loads.ll                3
3 files changed, 127 insertions, 103 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7f45c7a3571..963dad5ff20 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1648,58 +1648,12 @@ public:
bool hasStride(Value *V) { return LAI->hasStride(V); }
- /// Returns true if the target machine supports masked store operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
- return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
- }
-
- /// Returns true if the target machine supports masked load operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
- return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
- }
-
- /// Returns true if the target machine supports masked scatter operation
- /// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType) {
- return TTI->isLegalMaskedScatter(DataType);
- }
-
- /// Returns true if the target machine supports masked gather operation
- /// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType) {
- return TTI->isLegalMaskedGather(DataType);
- }
-
- /// Returns true if the target machine can represent \p V as a masked gather
- /// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
- auto *LI = dyn_cast<LoadInst>(V);
- auto *SI = dyn_cast<StoreInst>(V);
- if (!LI && !SI)
- return false;
- auto *Ptr = getPointerOperand(V);
- auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
- return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
- }
-
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }
- unsigned getNumPredStores() const { return NumPredStores; }
-
- /// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- bool isScalarWithPredication(Instruction *I);
-
- /// Returns true if \p I is a memory instruction with consecutive memory
- /// access that can be widened.
- bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
// Returns true if the NoNaN attribute is set on the function.
bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
@@ -1753,8 +1707,6 @@ private:
return LAI ? &LAI->getSymbolicStrides() : nullptr;
}
- unsigned NumPredStores = 0;
-
/// The loop that we evaluate.
Loop *TheLoop;
@@ -2060,7 +2012,53 @@ public:
collectLoopScalars(VF);
}
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+ return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
+ }
+
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+ return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
+ }
+
+ /// Returns true if the target machine supports masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType) {
+ return TTI.isLegalMaskedScatter(DataType);
+ }
+
+ /// Returns true if the target machine supports masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType) {
+ return TTI.isLegalMaskedGather(DataType);
+ }
+
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ bool LI = isa<LoadInst>(V);
+ bool SI = isa<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ty = getMemInstValueType(V);
+ return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+ }
+
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ bool isScalarWithPredication(Instruction *I);
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
private:
+ unsigned NumPredStores = 0;
+
/// \return An upper bound for the vectorization factor, larger than zero.
/// One is returned if vectorization should best be avoided due to cost.
unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
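
The relocated isLegalGatherOrScatter() above now derives the element type with
getMemInstValueType() instead of peeling it off the pointer type, as the removed
Legality copy did. For reference, a simplified sketch of that helper as it exists
elsewhere in this file (the value type of a load is its result type; for a store,
the type of the stored operand):

  static Type *getMemInstValueType(Value *I) {
    assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
           "Expected a load or store instruction");
    if (auto *LI = dyn_cast<LoadInst>(I))
      return LI->getType();
    return cast<StoreInst>(I)->getValueOperand()->getType();
  }
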
@@ -2112,6 +2110,10 @@ private:
/// as a vector operation.
bool isConsecutiveLoadOrStore(Instruction *I);
+ /// Returns true if an artificially high cost for emulated masked memrefs
+ /// should be used.
+ bool useEmulatedMaskMemRefHack(Instruction *I);
+
/// Create an analysis remark that explains why vectorization failed
///
/// \p RemarkName is the identifier for the remark. \return the remark object
@@ -5421,14 +5423,22 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
- if (!blockNeedsPredication(I->getParent()))
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
+ if (!Legal->blockNeedsPredication(I->getParent()))
return false;
switch(I->getOpcode()) {
default:
break;
- case Instruction::Store:
- return !isMaskRequired(I);
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ auto *Ptr = getPointerOperand(I);
+ auto *Ty = getMemInstValueType(I);
+ return isa<LoadInst>(I) ?
+ !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
+ : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
+ }
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::SRem:
@@ -5438,8 +5448,8 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
return false;
}
-bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
- unsigned VF) {
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+ unsigned VF) {
// Get and ensure we have a valid memory instruction.
LoadInst *LI = dyn_cast<LoadInst>(I);
StoreInst *SI = dyn_cast<StoreInst>(I);
@@ -5448,7 +5458,7 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
auto *Ptr = getPointerOperand(I);
// In order to be widened, the pointer should be consecutive, first of all.
- if (!isConsecutivePtr(Ptr))
+ if (!Legal->isConsecutivePtr(Ptr))
return false;
// If the instruction is a store located in a predicated block, it will be
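
To make the relocated isScalarWithPredication() concrete, a hedged example (not
part of the patch) of a loop whose guarded store lands in that path on a target
with no legal masked store or scatter for the access:

  // Illustrative only; assumes no masked-store/scatter support for i32.
  // The store is mask-required with no legal masked form, so
  // isScalarWithPredication() returns true and the store is emitted as an
  // element-by-element predicated scalar store.
  void clampNegatives(int *A, int N) {
    for (int i = 0; i < N; ++i)
      if (A[i] < 0)
        A[i] = 0; // conditional store: needs real or emulated masking
  }
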
@@ -5703,39 +5713,26 @@ bool LoopVectorizationLegality::blockCanBePredicated(
if (!LI)
return false;
if (!SafePtrs.count(LI->getPointerOperand())) {
- if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
- isLegalMaskedGather(LI->getType())) {
- MaskedOp.insert(LI);
- continue;
- }
// !llvm.mem.parallel_loop_access implies if-conversion safety.
- if (IsAnnotatedParallel)
- continue;
- return false;
+ // Otherwise, record that the load needs (real or emulated) masking
+ // and let the cost model decide.
+ if (!IsAnnotatedParallel)
+ MaskedOp.insert(LI);
+ continue;
}
}
if (I.mayWriteToMemory()) {
auto *SI = dyn_cast<StoreInst>(&I);
- // We only support predication of stores in basic blocks with one
- // predecessor.
if (!SI)
return false;
-
- // Build a masked store if it is legal for the target.
- if (isLegalMaskedStore(SI->getValueOperand()->getType(),
- SI->getPointerOperand()) ||
- isLegalMaskedScatter(SI->getValueOperand()->getType())) {
- MaskedOp.insert(SI);
- continue;
- }
-
- bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
- bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
-
- if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
- !isSinglePredecessor)
- return false;
+ // Predicated store requires some form of masking:
+ // 1) masked store HW instruction,
+ // 2) emulation via load-blend-store (only if safe and legal to do so;
+ //    beware of race conditions; sketched after this hunk), or
+ // 3) element-by-element predicate check and scalar store.
+ MaskedOp.insert(SI);
+ continue;
}
if (I.mayThrow())
return false;
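
A hedged sketch of option (2) from the comment above, the load-blend-store
emulation of a masked store, written as plain C++ over a 4-wide vector with
hypothetical names. Note the unconditional write-back: it is only sound if the
location is dereferenceable and not written concurrently by another thread,
which is the race-condition caveat.

  void emulatedMaskedStore(int *P, const int *NewVal, const bool *Mask) {
    int Old[4];
    for (int i = 0; i < 4; ++i) // unconditional wide load of current contents
      Old[i] = P[i];
    for (int i = 0; i < 4; ++i) // blend per lane, store all lanes back
      P[i] = Mask[i] ? NewVal[i] : Old[i];
  }
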
@@ -6050,13 +6047,6 @@ void InterleavedAccessInfo::analyzeInterleaving(
}
Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
- if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
- ORE->emit(createMissedAnalysis("ConditionalStore")
- << "store that is conditionally executed prevents vectorization");
- DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
- return None;
- }
-
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may be useful to do this since it's still likely to be
// dynamically uniform if the target can skip.
@@ -6183,9 +6173,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
float Cost = expectedCost(1).first;
-#ifndef NDEBUG
const float ScalarCost = Cost;
-#endif /* NDEBUG */
unsigned Width = 1;
DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
@@ -6216,6 +6204,14 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
}
}
+ if (!EnableCondStoresVectorization && NumPredStores) {
+ ORE->emit(createMissedAnalysis("ConditionalStore")
+ << "store that is conditionally executed prevents vectorization");
+ DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ Width = 1;
+ Cost = ScalarCost;
+ }
+
DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
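
A hedged sketch of the observable difference, not verbatim from the pass: with
conditional-store vectorization disabled, the pipeline no longer aborts in
computeMaxVF(); VF selection runs and degrades to the scalar answer, which is
why the generic cost-model remark now fires (see the test update below).

  // Hedged driver-side sketch under -enable-cond-stores-vec=false:
  VectorizationFactor VF = CM.selectVectorizationFactor(MaxVF);
  // VF.Width == 1 and VF.Cost == ScalarCost; the caller then reports
  // that vectorization is not beneficial instead of bailing early.
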
@@ -6267,7 +6263,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
// optimization to non-pointer types.
//
if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
+ !Legal->isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
continue;
MinWidth = std::min(MinWidth,
@@ -6592,6 +6588,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
return RUs;
}
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
+ // TODO: The cost model for emulated masked loads/stores is completely
+ // broken. This hack guides the cost model to an artificially high value,
+ // practically disabling vectorization with such operations, except where
+ // the previously deployed legality hack allowed using very low cost
+ // values. This avoids regressions coming simply from moving the
+ // "masked load/store" check from legality to the cost model.
+ // Masked load/gather emulation was previously never allowed.
+ // Only a limited number of masked store/scatter emulations was allowed.
+ assert(isScalarWithPredication(I) &&
+ "Expecting a scalar emulated instruction");
+ return isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ NumPredStores > NumberOfStoresToPredicate);
+}
+
void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
// If we aren't vectorizing the loop, or if we've already collected the
// instructions to scalarize, there's nothing to do. Collection may already
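
The budget consulted by useEmulatedMaskMemRefHack(), NumberOfStoresToPredicate,
is the pre-existing cl::opt in this file (-vectorize-num-stores-pred; its
default is assumed here to be 1). A hedged restatement of the predicate as a
comment table:

  // emulated masked load       -> hack fires (was never legal before)
  // 1st emulated masked store  -> no hack (old code would predicate it)
  // 2nd+ emulated masked store -> hack fires (exceeds the legacy budget)
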
@@ -6612,11 +6624,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
if (!Legal->blockNeedsPredication(BB))
continue;
for (Instruction &I : *BB)
- if (Legal->isScalarWithPredication(&I)) {
+ if (isScalarWithPredication(&I)) {
ScalarCostsTy ScalarCosts;
- if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ // Do not apply the discount logic when the hacked cost is needed
+ // for emulated masked memrefs.
+ if (!useEmulatedMaskMemRefHack(&I) &&
+ computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
-
// Remember that BB will remain after vectorization.
PredicatedBBsAfterVectorization.insert(BB);
}
@@ -6651,7 +6665,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
- if (Legal->isScalarWithPredication(I))
+ if (isScalarWithPredication(I))
return false;
// If any of the instruction's operands are uniform after vectorization,
@@ -6705,7 +6719,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
true, false);
ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
@@ -6848,9 +6862,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated store, it may not be executed for each vector
// lane. Scale the cost by the probability of executing the predicated
// block.
- if (Legal->isScalarWithPredication(I))
+ if (isScalarWithPredication(I)) {
Cost /= getReciprocalPredBlockProb();
+ if (useEmulatedMaskMemRefHack(I))
+ // Artificially set the cost to a value high enough to practically
+ // disable vectorization with such operations.
+ Cost = 3000000;
+ }
+
return Cost;
}
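
Worked arithmetic for the path above, under the assumption (from this file's
helper) that getReciprocalPredBlockProb() returns 2, i.e. predicated blocks are
assumed to execute with 50% probability:

  unsigned Cost = 40;                   // hypothetical raw scalarization cost
  Cost /= getReciprocalPredBlockProb(); // Cost == 20 at 50% probability
  // If useEmulatedMaskMemRefHack(I) also holds, Cost is then pinned at
  // 3000000, which practically rules out any vector VF.
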
@@ -6975,6 +6995,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (VF == 1)
return;
+ NumPredStores = 0;
for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
@@ -6982,6 +7003,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
if (!Ptr)
continue;
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ NumPredStores++;
if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
// Scalar load + broadcast
unsigned Cost = getUniformMemOpCost(&I, VF);
@@ -6990,7 +7013,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
// We assume that widening is the best solution when possible.
- if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+ if (memoryInstructionCanBeWidened(&I, VF)) {
unsigned Cost = getConsecutiveMemOpCost(&I, VF);
int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -7017,7 +7040,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
}
unsigned GatherScatterCost =
- Legal->isLegalGatherOrScatter(&I)
+ isLegalGatherOrScatter(&I)
? getGatherScatterCost(&I, VF) * NumAccesses
: std::numeric_limits<unsigned>::max();
@@ -7178,7 +7201,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF > 1 && Legal->isScalarWithPredication(I)) {
+ if (VF > 1 && isScalarWithPredication(I)) {
unsigned Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
@@ -7799,7 +7822,7 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
VFRange &Range) {
- if (Legal->isScalarWithPredication(I))
+ if (CM.isScalarWithPredication(I))
return false;
auto IsVectorizableOpcode = [](unsigned Opcode) {
@@ -7906,7 +7929,7 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication(
[&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
Range);
- bool IsPredicated = Legal->isScalarWithPredication(I);
+ bool IsPredicated = CM.isScalarWithPredication(I);
auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
// Find if I uses a predicated instruction. If so, it will use its scalar
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
index 25d6111d65a..7d1071b05d8 100644
--- a/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
+++ b/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
@@ -1,7 +1,7 @@
; RUN: opt < %s -enable-cond-stores-vec=false -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
; RUN: opt < %s -enable-cond-stores-vec=false -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
-; CHECK: remark: source.c:2:8: loop not vectorized: store that is conditionally executed prevents vectorization
+; CHECK: remark: source.c:2:8: the cost-model indicates that vectorization is not beneficial
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
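
The remark now comes from the cost model rather than from legality. A
hypothetical reconstruction of the C source behind the debug location
source.c:2:8 (the test carries only IR plus debug info, so this shape is an
assumption):

  void f(int *a, int n) {
    for (int i = 0; i < n; ++i)
      if (a[i])   // source.c:2:8 points into the guarded assignment
        a[i] = 0; // conditionally executed store
  }
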
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-loads.ll b/llvm/test/Transforms/LoopVectorize/hoist-loads.ll
index a20b0f6cdf3..db4774d8ba9 100644
--- a/llvm/test/Transforms/LoopVectorize/hoist-loads.ll
+++ b/llvm/test/Transforms/LoopVectorize/hoist-loads.ll
@@ -37,8 +37,9 @@ for.end:
}
; However, we can't hoist loads whose address we have not seen unconditionally
-; accessed.
+; accessed. One wide load is fine, but not the second.
; CHECK-LABEL: @dont_hoist_cond_load(
+; CHECK: load <2 x float>
; CHECK-NOT: load <2 x float>
define void @dont_hoist_cond_load() {