diff options
Diffstat (limited to 'llvm/lib/Transforms')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h | 4 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 119 | 
2 files changed, 66 insertions, 57 deletions
| diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 97077cce83e..a5e85f27fab 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -228,11 +228,11 @@ public:    /// Plan how to best vectorize, return the best VF and its cost, or None if    /// vectorization and interleaving should be avoided up front. -  Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF); +  Optional<VectorizationFactor> plan(unsigned UserVF);    /// Use the VPlan-native path to plan how to best vectorize, return the best    /// VF and its cost. -  VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); +  VectorizationFactor planInVPlanNativePath(unsigned UserVF);    /// Finalize the best decision and dispose of all other VPlans.    void setBestPlan(unsigned VF, unsigned UF); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 46265e3f3e1..62b9ba8d4f9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -836,6 +836,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,  namespace llvm { +// Loop vectorization cost-model hints how the scalar epilogue loop should be +// lowered. +enum ScalarEpilogueLowering { +  CM_ScalarEpilogueAllowed, +  CM_ScalarEpilogueNotAllowedOptSize, +  CM_ScalarEpilogueNotAllowedLowTripLoop +}; +  /// LoopVectorizationCostModel - estimates the expected speedups due to  /// vectorization.  /// In many cases vectorization is not profitable. This can happen because of @@ -845,7 +853,8 @@ namespace llvm {  /// different operations.  class LoopVectorizationCostModel {  public: -  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, +  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, +                             PredicatedScalarEvolution &PSE,                               LoopInfo *LI, LoopVectorizationLegality *Legal,                               const TargetTransformInfo &TTI,                               const TargetLibraryInfo *TLI, DemandedBits *DB, @@ -853,12 +862,13 @@ public:                               OptimizationRemarkEmitter *ORE, const Function *F,                               const LoopVectorizeHints *Hints,                               InterleavedAccessInfo &IAI) -      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), -    AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} +      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), +    LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), +    TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}    /// \return An upper bound for the vectorization factor, or None if    /// vectorization and interleaving should be avoided up front. -  Optional<unsigned> computeMaxVF(bool OptForSize); +  Optional<unsigned> computeMaxVF();    /// \return The most profitable vectorization factor and the cost of that VF.    /// This method checks every power of two up to MaxVF. If UserVF is not ZERO @@ -881,8 +891,7 @@ public:    /// If interleave count has been specified by metadata it will be returned.    /// Otherwise, the interleave count is computed and returned. VF and LoopCost    /// are the selected vectorization factor and the cost of the selected VF. -  unsigned selectInterleaveCount(bool OptForSize, unsigned VF, -                                 unsigned LoopCost); +  unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);    /// Memory access instruction may be vectorized in more than one way.    /// Form of instruction after vectorization depends on cost. @@ -1157,11 +1166,14 @@ public:    /// to handle accesses with gaps, and there is nothing preventing us from    /// creating a scalar epilogue.    bool requiresScalarEpilogue() const { -    return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); +    return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();    } -  /// Returns true if a scalar epilogue is not allowed due to optsize. -  bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } +  /// Returns true if a scalar epilogue is not allowed due to optsize or a +  /// loop hint annotation. +  bool isScalarEpilogueAllowed() const { +    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; +  }    /// Returns true if all loop blocks should be masked to fold tail loop.    bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1187,7 +1199,7 @@ private:    /// \return An upper bound for the vectorization factor, larger than zero.    /// One is returned if vectorization should best be avoided due to cost. -  unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); +  unsigned computeFeasibleMaxVF(unsigned ConstTripCount);    /// The vectorization cost is a combination of the cost itself and a boolean    /// indicating whether any of the contributing operations will actually @@ -1270,13 +1282,13 @@ private:    SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;    /// Records whether it is allowed to have the original scalar loop execute at -  /// least once. This may be needed as a fallback loop in case runtime  +  /// least once. This may be needed as a fallback loop in case runtime    /// aliasing/dependence checks fail, or to handle the tail/remainder    /// iterations when the trip count is unknown or doesn't divide by the VF,    /// or as a peel-loop to handle gaps in interleave-groups.    /// Under optsize and when the trip count is very small we don't allow any    /// iterations to execute in the scalar loop. -  bool IsScalarEpilogueAllowed = true; +  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;    /// All blocks of loop are to be masked to fold tail of scalar iterations.    bool FoldTailByMasking = false; @@ -4452,10 +4464,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,    // Check if masking is required.    // A Group may need masking for one of two reasons: it resides in a block that    // needs predication, or it was decided to use masking to deal with gaps. -  bool PredicatedAccessRequiresMasking =  +  bool PredicatedAccessRequiresMasking =        Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); -  bool AccessWithGapsRequiresMasking =  -      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; +  bool AccessWithGapsRequiresMasking = +      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();    if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)      return true; @@ -4675,7 +4687,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {    Uniforms[VF].insert(Worklist.begin(), Worklist.end());  } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {    if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {      // TODO: It may by useful to do since it's still likely to be dynamically      // uniform if the target can skip. @@ -4690,8 +4702,11 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {    }    unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); -  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. -    return computeFeasibleMaxVF(OptForSize, TC); +  if (isScalarEpilogueAllowed()) +    return computeFeasibleMaxVF(TC); + +  LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" << +                       "LV: Performing code size checks.\n");    if (Legal->getRuntimePointerChecking()->Need) {      ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") @@ -4740,15 +4755,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {    // Record that scalar epilogue is not allowed.    LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); -  IsScalarEpilogueAllowed = !OptForSize; -    // We don't create an epilogue when optimizing for size.    // Invalidate interleave groups that require an epilogue if we can't mask    // the interleave-group. -  if (!useMaskedInterleavedAccesses(TTI))  +  if (!useMaskedInterleavedAccesses(TTI))      InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); -  unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); +  unsigned MaxVF = computeFeasibleMaxVF(TC);    if (TC > 0 && TC % MaxVF == 0) {      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); @@ -4779,8 +4792,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {  }  unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, -                                                 unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {    MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);    unsigned SmallestType, WidestType;    std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4818,8 +4830,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,    }    unsigned MaxVF = MaxVectorSize; -  if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || -      (MaximizeBandwidth && !OptForSize)) { +  if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || +      (MaximizeBandwidth && isScalarEpilogueAllowed())) {      // Collect all viable vectorization factors larger than the default MaxVF      // (i.e. MaxVectorSize).      SmallVector<unsigned, 8> VFs; @@ -4958,8 +4970,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {    return {MinWidth, MaxWidth};  } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, -                                                           unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,                                                             unsigned LoopCost) {    // -- The interleave heuristics --    // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4975,8 +4986,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,    // 3. We don't interleave if we think that we will spill registers to memory    // due to the increased register pressure. -  // When we optimize for size, we don't interleave. -  if (OptForSize) +  if (!isScalarEpilogueAllowed())      return 1;    // We used the distance for the interleave count. @@ -5626,8 +5636,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,    }    // Calculate the cost of the whole interleaved group. -  bool UseMaskForGaps =  -      Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; +  bool UseMaskForGaps = +      Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();    unsigned Cost = TTI.getInterleavedMemoryOpCost(        I->getOpcode(), WideVecTy, Group->getFactor(), Indices,        Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); @@ -6167,8 +6177,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,  }  VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, -                                                unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {    unsigned VF = UserVF;    // Outer loop handling: They may require CFG and instruction level    // transformations before even evaluating whether vectorization is profitable. @@ -6207,10 +6216,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,    return VectorizationFactor::Disabled();  } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize, -                                                             unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {    assert(OrigLoop->empty() && "Inner loop expected."); -  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); +  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();    if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.      return None; @@ -7213,8 +7221,15 @@ static bool processLoopInVPlanNativePath(    assert(EnableVPlanNativePath && "VPlan-native path is disabled.");    Function *F = L->getHeader()->getParent();    InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); -  LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, -                                &Hints, IAI); + +  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; +  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && +      (F->hasOptSize() || +       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) +    SEL = CM_ScalarEpilogueNotAllowedOptSize; + +  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, +                                DB, AC, ORE, F, &Hints, IAI);    // Use the planner for outer loop vectorization.    // TODO: CM is not used at this point inside the planner. Turn CM into an    // optional argument if we don't need it in the future. @@ -7223,15 +7238,8 @@ static bool processLoopInVPlanNativePath(    // Get user vectorization factor.    const unsigned UserVF = Hints.getWidth(); -  // Check the function attributes and profiles to find out if this function -  // should be optimized for size. -  bool OptForSize = -      Hints.getForce() != LoopVectorizeHints::FK_Enabled && -      (F->hasOptSize() || -       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); -    // Plan how to best vectorize, return the best VF and its cost. -  const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); +  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);    // If we are stress testing VPlan builds, do not attempt to generate vector    // code. Masked vector code generation support will follow soon. @@ -7310,10 +7318,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {    // Check the function attributes and profiles to find out if this function    // should be optimized for size. -  bool OptForSize = -      Hints.getForce() != LoopVectorizeHints::FK_Enabled && +  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; +  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&        (F->hasOptSize() || -       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); +       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) +    SEL = CM_ScalarEpilogueNotAllowedOptSize;    // Entrance to the VPlan-native vectorization path. Outer loops are processed    // here. They may require CFG and instruction level transformations before @@ -7365,7 +7374,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {        // Loops with a very small trip count are considered for vectorization        // under OptForSize, thereby making sure the cost of their loop body is        // dominant, free of runtime guards and scalar iteration overheads. -      OptForSize = true; +      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;      }    } @@ -7411,8 +7420,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {    }    // Use the cost model. -  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, -                                &Hints, IAI); +  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, +                                DB, AC, ORE, F, &Hints, IAI);    CM.collectValuesToIgnore();    // Use the planner for vectorization. @@ -7422,7 +7431,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {    unsigned UserVF = Hints.getWidth();    // Plan how to best vectorize, return the best VF and its cost. -  Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); +  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);    VectorizationFactor VF = VectorizationFactor::Disabled();    unsigned IC = 1; @@ -7431,7 +7440,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {    if (MaybeVF) {      VF = *MaybeVF;      // Select the interleave count. -    IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); +    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);    }    // Identify the diagnostic messages that should be produced. | 

