diff options
| author | Vasileios Porpodas <vasileios.porpodas@intel.com> | 2019-06-26 21:25:24 +0000 | 
|---|---|---|
| committer | Vasileios Porpodas <vasileios.porpodas@intel.com> | 2019-06-26 21:25:24 +0000 | 
| commit | 574cb0eb3a7ac95e62d223a60bef891171dfe321 (patch) | |
| tree | 67d843d903e12b5beac057c17c3f6f00048f216d /llvm/lib/Transforms/Vectorize | |
| parent | b5999f17d4fc5f909809cf997215f400973a536a (diff) | |
| download | bcm5719-llvm-574cb0eb3a7ac95e62d223a60bef891171dfe321.tar.gz bcm5719-llvm-574cb0eb3a7ac95e62d223a60bef891171dfe321.zip | |
[SLP] Look-ahead operand reordering heuristic.
Summary: This patch introduces a new heuristic for guiding operand reordering. The new "look-ahead" heuristic can look beyond the immediate predecessors. This helps break ties when the immediate predecessors have identical opcodes (see lit test for an example).
Reviewers: RKSimon, ABataev, dtemirbulatov, Ayal, hfinkel, rnk
Reviewed By: RKSimon, dtemirbulatov
Subscribers: rnk, rcorcs, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D60897
llvm-svn: 364478
Diffstat (limited to 'llvm/lib/Transforms/Vectorize')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 282 | 
1 files changed, 236 insertions, 46 deletions
| diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c47f6980a36..b6065c8d5eb 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -147,6 +147,12 @@ static cl::opt<unsigned> MinTreeSize(      "slp-min-tree-size", cl::init(3), cl::Hidden,      cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt<int> LookAheadMaxDepth( +    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, +    cl::desc("The maximum look-ahead depth for operand reordering scores")); +  static cl::opt<bool>      ViewSLPTree("view-slp-tree", cl::Hidden,                  cl::desc("Display the SLP trees with Graphviz")); @@ -708,6 +714,7 @@ public:      const DataLayout &DL;      ScalarEvolution &SE; +    const BoUpSLP &R;      /// \returns the operand data at \p OpIdx and \p Lane.      OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -733,6 +740,211 @@ public:        std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);      } +    // The hard-coded scores listed here are not very important. When computing +    // the scores of matching one sub-tree with another, we are basically +    // counting the number of values that are matching. So even if all scores +    // are set to 1, we would still get a decent matching result. +    // However, sometimes we have to break ties. For example we may have to +    // choose between matching loads vs matching opcodes. This is what these +    // scores are helping us with: they provide the order of preference. + +    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). +    static const int ScoreConsecutiveLoads = 3; +    /// Constants. +    static const int ScoreConstants = 2; +    /// Instructions with the same opcode. +    static const int ScoreSameOpcode = 2; +    /// Instructions with alt opcodes (e.g, add + sub). +    static const int ScoreAltOpcodes = 1; +    /// Identical instructions (a.k.a. splat or broadcast). +    static const int ScoreSplat = 1; +    /// Matching with an undef is preferable to failing. +    static const int ScoreUndef = 1; +    /// Score for failing to find a decent match. +    static const int ScoreFail = 0; +    /// User exteranl to the vectorized code. +    static const int ExternalUseCost = 1; +    /// The user is internal but in a different lane. +    static const int UserInDiffLaneCost = ExternalUseCost; + +    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. +    static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, +                               ScalarEvolution &SE) { +      auto *LI1 = dyn_cast<LoadInst>(V1); +      auto *LI2 = dyn_cast<LoadInst>(V2); +      if (LI1 && LI2) +        return isConsecutiveAccess(LI1, LI2, DL, SE) +                   ? VLOperands::ScoreConsecutiveLoads +                   : VLOperands::ScoreFail; + +      auto *C1 = dyn_cast<Constant>(V1); +      auto *C2 = dyn_cast<Constant>(V2); +      if (C1 && C2) +        return VLOperands::ScoreConstants; + +      auto *I1 = dyn_cast<Instruction>(V1); +      auto *I2 = dyn_cast<Instruction>(V2); +      if (I1 && I2) { +        if (I1 == I2) +          return VLOperands::ScoreSplat; +        InstructionsState S = getSameOpcode({I1, I2}); +        // Note: Only consider instructions with <= 2 operands to avoid +        // complexity explosion. +        if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) +          return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes +                                  : VLOperands::ScoreSameOpcode; +      } + +      if (isa<UndefValue>(V2)) +        return VLOperands::ScoreUndef; + +      return VLOperands::ScoreFail; +    } + +    /// Holds the values and their lane that are taking part in the look-ahead +    /// score calculation. This is used in the external uses cost calculation. +    SmallDenseMap<Value *, int> InLookAheadValues; + +    /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are +    /// either external to the vectorized code, or require shuffling. +    int getExternalUsesCost(const std::pair<Value *, int> &LHS, +                            const std::pair<Value *, int> &RHS) { +      int Cost = 0; +      SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS}; +      for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { +        Value *V = Values[Idx].first; +        // Calculate the absolute lane, using the minimum relative lane of LHS +        // and RHS as base and Idx as the offset. +        int Ln = std::min(LHS.second, RHS.second) + Idx; +        assert(Ln >= 0 && "Bad lane calculation"); +        for (User *U : V->users()) { +          if (const TreeEntry *UserTE = R.getTreeEntry(U)) { +            // The user is in the VectorizableTree. Check if we need to insert. +            auto It = llvm::find(UserTE->Scalars, U); +            assert(It != UserTE->Scalars.end() && "U is in UserTE"); +            int UserLn = std::distance(UserTE->Scalars.begin(), It); +            assert(UserLn >= 0 && "Bad lane"); +            if (UserLn != Ln) +              Cost += UserInDiffLaneCost; +          } else { +            // Check if the user is in the look-ahead code. +            auto It2 = InLookAheadValues.find(U); +            if (It2 != InLookAheadValues.end()) { +              // The user is in the look-ahead code. Check the lane. +              if (It2->second != Ln) +                Cost += UserInDiffLaneCost; +            } else { +              // The user is neither in SLP tree nor in the look-ahead code. +              Cost += ExternalUseCost; +            } +          } +        } +      } +      return Cost; +    } + +    /// Go through the operands of \p LHS and \p RHS recursively until \p +    /// MaxLevel, and return the cummulative score. For example: +    /// \verbatim +    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1] +    ///     \ /         \ /         \ /        \ / +    ///      +           +           +          + +    ///     G1          G2          G3         G4 +    /// \endverbatim +    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at +    /// each level recursively, accumulating the score. It starts from matching +    /// the additions at level 0, then moves on to the loads (level 1). The +    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and +    /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while +    /// {A[0],C[0]} has a score of VLOperands::ScoreFail. +    /// Please note that the order of the operands does not matter, as we +    /// evaluate the score of all profitable combinations of operands. In +    /// other words the score of G1 and G4 is the same as G1 and G2. This +    /// heuristic is based on ideas described in: +    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative +    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, +    ///   Luís F. W. Góes +    int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, +                           const std::pair<Value *, int> &RHS, int CurrLevel, +                           int MaxLevel) { + +      Value *V1 = LHS.first; +      Value *V2 = RHS.first; +      // Get the shallow score of V1 and V2. +      int ShallowScoreAtThisLevel = +          std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - +                                       getExternalUsesCost(LHS, RHS)); +      int Lane1 = LHS.second; +      int Lane2 = RHS.second; + +      // If reached MaxLevel, +      //  or if V1 and V2 are not instructions, +      //  or if they are SPLAT, +      //  or if they are not consecutive, early return the current cost. +      auto *I1 = dyn_cast<Instruction>(V1); +      auto *I2 = dyn_cast<Instruction>(V2); +      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || +          ShallowScoreAtThisLevel == VLOperands::ScoreFail || +          (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) +        return ShallowScoreAtThisLevel; +      assert(I1 && I2 && "Should have early exited."); + +      // Keep track of in-tree values for determining the external-use cost. +      InLookAheadValues[V1] = Lane1; +      InLookAheadValues[V2] = Lane2; + +      // Contains the I2 operand indexes that got matched with I1 operands. +      SmallSet<unsigned, 4> Op2Used; + +      // Recursion towards the operands of I1 and I2. We are trying all possbile +      // operand pairs, and keeping track of the best score. +      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); +           OpIdx1 != NumOperands1; ++OpIdx1) { +        // Try to pair op1I with the best operand of I2. +        int MaxTmpScore = 0; +        unsigned MaxOpIdx2 = 0; +        bool FoundBest = false; +        // If I2 is commutative try all combinations. +        unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; +        unsigned ToIdx = isCommutative(I2) +                             ? I2->getNumOperands() +                             : std::min(I2->getNumOperands(), OpIdx1 + 1); +        assert(FromIdx <= ToIdx && "Bad index"); +        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { +          // Skip operands already paired with OpIdx1. +          if (Op2Used.count(OpIdx2)) +            continue; +          // Recursively calculate the cost at each level +          int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, +                                            {I2->getOperand(OpIdx2), Lane2}, +                                            CurrLevel + 1, MaxLevel); +          // Look for the best score. +          if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { +            MaxTmpScore = TmpScore; +            MaxOpIdx2 = OpIdx2; +            FoundBest = true; +          } +        } +        if (FoundBest) { +          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. +          Op2Used.insert(MaxOpIdx2); +          ShallowScoreAtThisLevel += MaxTmpScore; +        } +      } +      return ShallowScoreAtThisLevel; +    } + +    /// \Returns the look-ahead score, which tells us how much the sub-trees +    /// rooted at \p LHS and \p RHS match, the more they match the higher the +    /// score. This helps break ties in an informed way when we cannot decide on +    /// the order of the operands by just considering the immediate +    /// predecessors. +    int getLookAheadScore(const std::pair<Value *, int> &LHS, +                          const std::pair<Value *, int> &RHS) { +      InLookAheadValues.clear(); +      return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); +    } +      // Search all operands in Ops[*][Lane] for the one that matches best      // Ops[OpIdx][LastLane] and return its opreand index.      // If no good match can be found, return None. @@ -750,9 +962,6 @@ public:        // The linearized opcode of the operand at OpIdx, Lane.        bool OpIdxAPO = getData(OpIdx, Lane).APO; -      const unsigned BestScore = 2; -      const unsigned GoodScore = 1; -        // The best operand index and its score.        // Sometimes we have more than one option (e.g., Opcode and Undefs), so we        // are using the score to differentiate between the two. @@ -781,41 +990,19 @@ public:          // Look for an operand that matches the current mode.          switch (RMode) {          case ReorderingMode::Load: -          if (isa<LoadInst>(Op)) { -            // Figure out which is left and right, so that we can check for -            // consecutive loads -            bool LeftToRight = Lane > LastLane; -            Value *OpLeft = (LeftToRight) ? OpLastLane : Op; -            Value *OpRight = (LeftToRight) ? Op : OpLastLane; -            if (isConsecutiveAccess(cast<LoadInst>(OpLeft), -                                    cast<LoadInst>(OpRight), DL, SE)) -              BestOp.Idx = Idx; -          } -          break; -        case ReorderingMode::Opcode: -          // We accept both Instructions and Undefs, but with different scores. -          if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && -               cast<Instruction>(Op)->getOpcode() == -                   cast<Instruction>(OpLastLane)->getOpcode()) || -              (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || -              isa<UndefValue>(Op)) { -            // An instruction has a higher score than an undef. -            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; -            if (Score > BestOp.Score) { -              BestOp.Idx = Idx; -              BestOp.Score = Score; -            } -          } -          break;          case ReorderingMode::Constant: -          if (isa<Constant>(Op)) { -            unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; -            if (Score > BestOp.Score) { -              BestOp.Idx = Idx; -              BestOp.Score = Score; -            } +        case ReorderingMode::Opcode: { +          bool LeftToRight = Lane > LastLane; +          Value *OpLeft = (LeftToRight) ? OpLastLane : Op; +          Value *OpRight = (LeftToRight) ? Op : OpLastLane; +          unsigned Score = +              getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); +          if (Score > BestOp.Score) { +            BestOp.Idx = Idx; +            BestOp.Score = Score;            }            break; +        }          case ReorderingMode::Splat:            if (Op == OpLastLane)              BestOp.Idx = Idx; @@ -946,8 +1133,8 @@ public:    public:      /// Initialize with all the operands of the instruction vector \p RootVL.      VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, -               ScalarEvolution &SE) -        : DL(DL), SE(SE) { +               ScalarEvolution &SE, const BoUpSLP &R) +        : DL(DL), SE(SE), R(R) {        // Append all the operands of RootVL.        appendOperandsOfVL(RootVL);      } @@ -1169,7 +1356,8 @@ private:                                               SmallVectorImpl<Value *> &Left,                                               SmallVectorImpl<Value *> &Right,                                               const DataLayout &DL, -                                             ScalarEvolution &SE); +                                             ScalarEvolution &SE, +                                             const BoUpSLP &R);    struct TreeEntry {      using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;      TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -2371,7 +2559,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,          // Commutative predicate - collect + sort operands of the instructions          // so that each side is more likely to have the same opcode.          assert(P0 == SwapP0 && "Commutative Predicate mismatch"); -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);        } else {          // Collect operands - commute if it uses the swapped predicate.          for (Value *V : VL) { @@ -2416,7 +2604,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        // have the same opcode.        if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {          ValueList Left, Right; -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);          buildTree_rec(Left, Depth + 1, {TE, 0});          buildTree_rec(Right, Depth + 1, {TE, 1});          return; @@ -2585,7 +2773,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,        // Reorder operands if reordering would enable vectorization.        if (isa<BinaryOperator>(VL0)) {          ValueList Left, Right; -        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); +        reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);          buildTree_rec(Left, Depth + 1, {TE, 0});          buildTree_rec(Right, Depth + 1, {TE, 1});          return; @@ -3302,13 +3490,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {  // Perform operand reordering on the instructions in VL and return the reordered  // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode( -    ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, -    SmallVectorImpl<Value *> &Right, const DataLayout &DL, -    ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, +                                             SmallVectorImpl<Value *> &Left, +                                             SmallVectorImpl<Value *> &Right, +                                             const DataLayout &DL, +                                             ScalarEvolution &SE, +                                             const BoUpSLP &R) {    if (VL.empty())      return; -  VLOperands Ops(VL, DL, SE); +  VLOperands Ops(VL, DL, SE, R);    // Reorder the operands in place.    Ops.reorder();    Left = Ops.getVL(0); | 

