diff options
4 files changed, 183 insertions, 55 deletions
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h index b144006e262..83166775c98 100644 --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -55,7 +55,8 @@ OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName,  /// for example 'force', means a decision has been made. So, we need to be  /// careful NOT to add them if the user hasn't specifically asked so.  class LoopVectorizeHints { -  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED }; +  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED, +                  HK_PREDICATE };    /// Hint - associates name and validation with the hint value.    struct Hint { @@ -81,6 +82,9 @@ class LoopVectorizeHints {    /// Already Vectorized    Hint IsVectorized; +  /// Vector Predicate +  Hint Predicate; +    /// Return the loop metadata prefix.    
static StringRef Prefix() { return "llvm.loop."; } @@ -109,6 +113,7 @@ public:    unsigned getWidth() const { return Width.Value; }    unsigned getInterleave() const { return Interleave.Value; }    unsigned getIsVectorized() const { return IsVectorized.Value; } +  unsigned getPredicate() const { return Predicate.Value; }    enum ForceKind getForce() const {      if ((ForceKind)Force.Value == FK_Undefined &&          hasDisableAllTransformsHint(TheLoop)) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 6ef8dc2d3cd..cede505aed2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -88,6 +88,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {    case HK_FORCE:      return (Val <= 1);    case HK_ISVECTORIZED: +  case HK_PREDICATE:      return (Val == 0 || Val == 1);    }    return false; @@ -99,7 +100,9 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,      : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),        Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),        Force("vectorize.enable", FK_Undefined, HK_FORCE), -      IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { +      IsVectorized("isvectorized", 0, HK_ISVECTORIZED), +      Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L), +      ORE(ORE) {    // Populate values with existing loop metadata.    
getHintsFromMetadata(); @@ -250,7 +253,7 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {      return;    unsigned Val = C->getZExtValue(); -  Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; +  Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate};    for (auto H : Hints) {      if (Name == H->Name) {        if (H->validate(Val)) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 62b9ba8d4f9..369b29a1e9b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -839,9 +839,21 @@ namespace llvm {  // Loop vectorization cost-model hints how the scalar epilogue loop should be  // lowered.  enum ScalarEpilogueLowering { + +  // The default: allowing scalar epilogues.    CM_ScalarEpilogueAllowed, + +  // Vectorization with OptForSize: don't allow epilogues.    CM_ScalarEpilogueNotAllowedOptSize, -  CM_ScalarEpilogueNotAllowedLowTripLoop + +  // A special case of vectorization with OptForSize: loops with a very small +  // trip count are considered for vectorization under OptForSize, thereby +  // making sure the cost of their loop body is dominant, free of runtime +  // guards and scalar iteration overheads. +  CM_ScalarEpilogueNotAllowedLowTripLoop, + +  // Loop hint predicate indicating an epilogue is undesired. 
+  CM_ScalarEpilogueNotNeededPredicatePragma  };  /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -854,22 +866,26 @@ enum ScalarEpilogueLowering {  class LoopVectorizationCostModel {  public:    LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, -                             PredicatedScalarEvolution &PSE, -                             LoopInfo *LI, LoopVectorizationLegality *Legal, +                             PredicatedScalarEvolution &PSE, LoopInfo *LI, +                             LoopVectorizationLegality *Legal,                               const TargetTransformInfo &TTI,                               const TargetLibraryInfo *TLI, DemandedBits *DB,                               AssumptionCache *AC,                               OptimizationRemarkEmitter *ORE, const Function *F,                               const LoopVectorizeHints *Hints,                               InterleavedAccessInfo &IAI) -      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), -    LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), -    TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} +      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), +        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), +        Hints(Hints), InterleaveInfo(IAI) {}    /// \return An upper bound for the vectorization factor, or None if    /// vectorization and interleaving should be avoided up front.    Optional<unsigned> computeMaxVF(); +  /// \return True if runtime checks are required for vectorization, and false +  /// otherwise. +  bool runtimeChecksRequired(); +    /// \return The most profitable vectorization factor and the cost of that VF.    /// This method checks every power of two up to MaxVF. 
If UserVF is not ZERO    /// then this vectorization factor will be selected if vectorization is @@ -4687,26 +4703,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {    Uniforms[VF].insert(Worklist.begin(), Worklist.end());  } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { -  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { -    // TODO: It may by useful to do since it's still likely to be dynamically -    // uniform if the target can skip. -    LLVM_DEBUG( -        dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - -    ORE->emit( -      createMissedAnalysis("CantVersionLoopWithDivergentTarget") -      << "runtime pointer checks needed. Not enabled for divergent target"); - -    return None; -  } - -  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); -  if (isScalarEpilogueAllowed()) -    return computeFeasibleMaxVF(TC); - -  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" << -                       "LV: Performing code size checks.\n"); +bool LoopVectorizationCostModel::runtimeChecksRequired() { +  LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");    if (Legal->getRuntimePointerChecking()->Need) {      ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") @@ -4716,7 +4714,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {      LLVM_DEBUG(          dbgs()          << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); -    return None; +    return true;    }    if (!PSE.getUnionPredicate().getPredicates().empty()) { @@ -4727,7 +4725,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {      LLVM_DEBUG(          dbgs()          << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n"); -    return None; +    return true;    }    // FIXME: Avoid specializing for stride==1 instead of bailing out. 
@@ -4739,12 +4737,28 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {      LLVM_DEBUG(          dbgs()          << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); +    return true; +  } + +  return false; +} + +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { +  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { +    // TODO: It may be useful to do since it's still likely to be dynamically +    // uniform if the target can skip. +    LLVM_DEBUG( +        dbgs() << "LV: Not inserting runtime ptr check for divergent target"); + +    ORE->emit( +      createMissedAnalysis("CantVersionLoopWithDivergentTarget") +      << "runtime pointer checks needed. Not enabled for divergent target"); +      return None;    } -  // If we optimize the program for size, avoid creating the tail loop. +  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);    LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); -    if (TC == 1) {      ORE->emit(createMissedAnalysis("SingleIterationLoop")                << "loop trip count is one, irrelevant for vectorization"); @@ -4752,18 +4766,44 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {      return None;    } -  // Record that scalar epilogue is not allowed. 
-  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); +  switch (ScalarEpilogueStatus) { +  default: +    return None; +  case CM_ScalarEpilogueAllowed: +    return computeFeasibleMaxVF(TC); +  case CM_ScalarEpilogueNotNeededPredicatePragma: +    LLVM_DEBUG( +        dbgs() << "LV: vector predicate hint found.\n" +               << "LV: Not allowing scalar epilogue, creating predicated " +               << "vector loop.\n"); +    break; +  case CM_ScalarEpilogueNotAllowedLowTripLoop: +    // fallthrough as a special case of OptForSize +  case CM_ScalarEpilogueNotAllowedOptSize: +    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) +      LLVM_DEBUG( +          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); +    else +      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " +                        << "count.\n"); + +    // Bail if runtime checks are required, which are not good when optimising +    // for size. +    if (runtimeChecksRequired()) +      return None; +    break; +  } + +  // Now try the tail folding -  // We don't create an epilogue when optimizing for size.    // Invalidate interleave groups that require an epilogue if we can't mask    // the interleave-group.    if (!useMaskedInterleavedAccesses(TTI))      InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();    unsigned MaxVF = computeFeasibleMaxVF(TC); -    if (TC > 0 && TC % MaxVF == 0) { +    // Accept MaxVF if we do not have a tail.      
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");      return MaxVF;    } @@ -7207,6 +7247,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {    State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);  } +static ScalarEpilogueLowering +getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, +                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { +  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; +  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && +      (F->hasOptSize() || +       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) +    SEL = CM_ScalarEpilogueNotAllowedOptSize; +  else if (Hints.getPredicate()) +    SEL = CM_ScalarEpilogueNotNeededPredicatePragma; + +  return SEL; +} +  // Process the loop in the VPlan-native vectorization path. This path builds  // VPlan upfront in the vectorization pipeline, which allows to apply  // VPlan-to-VPlan transformations from the very beginning without modifying the @@ -7221,15 +7275,10 @@ static bool processLoopInVPlanNativePath(    assert(EnableVPlanNativePath && "VPlan-native path is disabled.");    Function *F = L->getHeader()->getParent();    InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); +  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); -  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; -  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && -      (F->hasOptSize() || -       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) -    SEL = CM_ScalarEpilogueNotAllowedOptSize; - -  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, -                                DB, AC, ORE, F, &Hints, IAI); +  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, +                                &Hints, IAI);    // Use the planner for outer loop vectorization.    // TODO: CM is not used at this point inside the planner. 
Turn CM into an    // optional argument if we don't need it in the future. @@ -7318,11 +7367,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {    // Check the function attributes and profiles to find out if this function    // should be optimized for size. -  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; -  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && -      (F->hasOptSize() || -       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) -    SEL = CM_ScalarEpilogueNotAllowedOptSize; +  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI);    // Entrance to the VPlan-native vectorization path. Outer loops are processed    // here. They may require CFG and instruction level transformations before @@ -7371,9 +7416,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {        LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");      else {        LLVM_DEBUG(dbgs() << "\n"); -      // Loops with a very small trip count are considered for vectorization -      // under OptForSize, thereby making sure the cost of their loop body is -      // dominant, free of runtime guards and scalar iteration overheads.        SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;      }    } @@ -7420,8 +7462,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {    }    // Use the cost model. -  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, -                                DB, AC, ORE, F, &Hints, IAI); +  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, +                                F, &Hints, IAI);    CM.collectValuesToIgnore();    // Use the planner for vectorization. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll new file mode 100644 index 00000000000..d7767385c78 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -0,0 +1,78 @@ +; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { +; CHECK-LABEL: tail_folding_enabled( +; CHECK:  vector.body: +; CHECK:  %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32( +; CHECK:  %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32( +; CHECK:  %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load +; CHECK:  call void @llvm.masked.store.v8i32.p0v8i32( +; CHECK:  %index.next = add i64 %index, 8 +; CHECK:  %12 = icmp eq i64 %index.next, 432 +; CHECK:  br i1 %12, label %middle.block, label %vector.body, !llvm.loop !0 + +entry: +  br label %for.body + +for.cond.cleanup: +  ret void + +for.body: +  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv +  %0 = load i32, i32* %arrayidx, align 4 +  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv +  %1 = load i32, i32* %arrayidx2, align 4 +  %add = add nsw i32 %1, %0 +  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv +  store i32 %add, i32* %arrayidx4, align 4 +  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +  %exitcond = icmp eq i64 %indvars.iv.next, 430 +  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6 +} + +define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { +; 
CHECK-LABEL: tail_folding_disabled( +; CHECK:      vector.body: +; CHECK-NOT:  @llvm.masked.load.v8i32.p0v8i32( +; CHECK-NOT:  @llvm.masked.store.v8i32.p0v8i32( +; CHECK:      br i1 %44, label {{.*}}, label %vector.body +entry: +  br label %for.body + +for.cond.cleanup: +  ret void + +for.body: +  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv +  %0 = load i32, i32* %arrayidx, align 4 +  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv +  %1 = load i32, i32* %arrayidx2, align 4 +  %add = add nsw i32 %1, %0 +  %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv +  store i32 %add, i32* %arrayidx4, align 4 +  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +  %exitcond = icmp eq i64 %indvars.iv.next, 430 +  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10 +} + +; CHECK:      !0 = distinct !{!0, !1} +; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-NEXT: !2 = distinct !{!2, !3, !1} +; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-NEXT: !4 = distinct !{!4, !1} +; CHECK-NEXT: !5 = distinct !{!5, !3, !1} + +attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" } + +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} + +!10 = distinct !{!10, !11, !12} +!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} +!12 = !{!"llvm.loop.vectorize.enable", i1 true}

