summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp7
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp146
2 files changed, 99 insertions, 54 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 6ef8dc2d3cd..cede505aed2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -88,6 +88,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
case HK_FORCE:
return (Val <= 1);
case HK_ISVECTORIZED:
+ case HK_PREDICATE:
return (Val == 0 || Val == 1);
}
return false;
@@ -99,7 +100,9 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
: Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
Force("vectorize.enable", FK_Undefined, HK_FORCE),
- IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) {
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
+ Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L),
+ ORE(ORE) {
// Populate values with existing loop metadata.
getHintsFromMetadata();
@@ -250,7 +253,7 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
return;
unsigned Val = C->getZExtValue();
- Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized};
+ Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate};
for (auto H : Hints) {
if (Name == H->Name) {
if (H->validate(Val))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 62b9ba8d4f9..369b29a1e9b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -839,9 +839,21 @@ namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {
+
+ // The default: allowing scalar epilogues.
CM_ScalarEpilogueAllowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
CM_ScalarEpilogueNotAllowedOptSize,
- CM_ScalarEpilogueNotAllowedLowTripLoop
+
+ // A special case of vectorisation with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ CM_ScalarEpilogueNotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
+ CM_ScalarEpilogueNotNeededPredicatePragma
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -854,22 +866,26 @@ enum ScalarEpilogueLowering {
class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
- PredicatedScalarEvolution &PSE,
- LoopInfo *LI, LoopVectorizationLegality *Legal,
+ PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints,
InterleavedAccessInfo &IAI)
- : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE),
- LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE),
- TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
+ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
+ Hints(Hints), InterleaveInfo(IAI) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
Optional<unsigned> computeMaxVF();
+ /// \return True if runtime checks are required for vectorization, and false
+ /// otherwise.
+ bool runtimeChecksRequired();
+
/// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to MaxVF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
@@ -4687,26 +4703,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
- if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
- // TODO: It may by useful to do since it's still likely to be dynamically
- // uniform if the target can skip.
- LLVM_DEBUG(
- dbgs() << "LV: Not inserting runtime ptr check for divergent target");
-
- ORE->emit(
- createMissedAnalysis("CantVersionLoopWithDivergentTarget")
- << "runtime pointer checks needed. Not enabled for divergent target");
-
- return None;
- }
-
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (isScalarEpilogueAllowed())
- return computeFeasibleMaxVF(TC);
-
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" <<
- "LV: Performing code size checks.\n");
+bool LoopVectorizationCostModel::runtimeChecksRequired() {
+ LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
if (Legal->getRuntimePointerChecking()->Need) {
ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
@@ -4716,7 +4714,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
LLVM_DEBUG(
dbgs()
<< "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
- return None;
+ return true;
}
if (!PSE.getUnionPredicate().getPredicates().empty()) {
@@ -4727,7 +4725,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
LLVM_DEBUG(
dbgs()
<< "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
- return None;
+ return true;
}
// FIXME: Avoid specializing for stride==1 instead of bailing out.
@@ -4739,12 +4737,28 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
LLVM_DEBUG(
dbgs()
<< "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+ return true;
+ }
+
+ return false;
+}
+
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+ if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+ // TODO: It may by useful to do since it's still likely to be dynamically
+ // uniform if the target can skip.
+ LLVM_DEBUG(
+ dbgs() << "LV: Not inserting runtime ptr check for divergent target");
+
+ ORE->emit(
+ createMissedAnalysis("CantVersionLoopWithDivergentTarget")
+ << "runtime pointer checks needed. Not enabled for divergent target");
+
return None;
}
- // If we optimize the program for size, avoid creating the tail loop.
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
if (TC == 1) {
ORE->emit(createMissedAnalysis("SingleIterationLoop")
<< "loop trip count is one, irrelevant for vectorization");
@@ -4752,18 +4766,44 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
return None;
}
- // Record that scalar epilogue is not allowed.
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ switch (ScalarEpilogueStatus) {
+ default:
+ return None;
+ case CM_ScalarEpilogueAllowed:
+ return computeFeasibleMaxVF(TC);
+ case CM_ScalarEpilogueNotNeededPredicatePragma:
+ LLVM_DEBUG(
+ dbgs() << "LV: vector predicate hint found.\n"
+ << "LV: Not allowing scalar epilogue, creating predicated "
+ << "vector loop.\n");
+ break;
+ case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ // fallthrough as a special case of OptForSize
+ case CM_ScalarEpilogueNotAllowedOptSize:
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ LLVM_DEBUG(
+ dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ else
+ LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
+ << "count.\n");
+
+ // Bail if runtime checks are required, which are not good when optimising
+ // for size.
+ if (runtimeChecksRequired())
+ return None;
+ break;
+ }
+
+ // Now try the tail folding
- // We don't create an epilogue when optimizing for size.
// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
if (!useMaskedInterleavedAccesses(TTI))
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
unsigned MaxVF = computeFeasibleMaxVF(TC);
-
if (TC > 0 && TC % MaxVF == 0) {
+ // Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
}
@@ -7207,6 +7247,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
}
+static ScalarEpilogueLowering
+getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) {
+ ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+ if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
+ (F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
+ SEL = CM_ScalarEpilogueNotAllowedOptSize;
+ else if (Hints.getPredicate())
+ SEL = CM_ScalarEpilogueNotNeededPredicatePragma;
+
+ return SEL;
+}
+
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
@@ -7221,15 +7275,10 @@ static bool processLoopInVPlanNativePath(
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
- ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
- SEL = CM_ScalarEpilogueNotAllowedOptSize;
-
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI,
- DB, AC, ORE, F, &Hints, IAI);
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
@@ -7318,11 +7367,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the function attributes and profiles to find out if this function
// should be optimized for size.
- ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
- if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
- (F->hasOptSize() ||
- llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)))
- SEL = CM_ScalarEpilogueNotAllowedOptSize;
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);
// Entrance to the VPlan-native vectorization path. Outer loops are processed
// here. They may require CFG and instruction level transformations before
@@ -7371,9 +7416,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
else {
LLVM_DEBUG(dbgs() << "\n");
- // Loops with a very small trip count are considered for vectorization
- // under OptForSize, thereby making sure the cost of their loop body is
- // dominant, free of runtime guards and scalar iteration overheads.
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
}
}
@@ -7420,8 +7462,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
}
// Use the cost model.
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI,
- DB, AC, ORE, F, &Hints, IAI);
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
+ F, &Hints, IAI);
CM.collectValuesToIgnore();
// Use the planner for vectorization.
OpenPOWER on IntegriCloud