diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-06-21 17:57:01 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2019-06-21 17:57:01 +0000 |
commit | 5698921be2d567f6abf925479ac9f5a376d6d74f (patch) | |
tree | 3d8514354279b7db7837e601f173f6039cd754ac /llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | |
parent | 2441a4074c19257849b54be9b5e2a5c0ea87a48c (diff) | |
download | bcm5719-llvm-5698921be2d567f6abf925479ac9f5a376d6d74f.tar.gz bcm5719-llvm-5698921be2d567f6abf925479ac9f5a376d6d74f.zip |
[SLP] Look-ahead operand reordering heuristic.
This patch introduces a new heuristic for guiding operand reordering. The new "look-ahead" heuristic can look beyond the immediate predecessors. This helps break ties when the immediate predecessors have identical opcodes (see lit test for an example).
Committed on behalf of @vporpo (Vasileios Porpodas)
Differential Revision: https://reviews.llvm.org/D60897
llvm-svn: 364084
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 278 |
1 files changed, 232 insertions, 46 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2da9ead14ca..50168afeaa3 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -147,6 +147,12 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt<int> LookAheadMaxDepth( + "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for operand reordering scores")); + static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -708,6 +714,7 @@ public: const DataLayout &DL; ScalarEvolution &SE; + const BoUpSLP &R; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -733,6 +740,207 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } + // The hard-coded scores listed here are not very important. When computing + // the scores of matching one sub-tree with another, we are basically + // counting the number of values that are matching. So even if all scores + // are set to 1, we would still get a decent matching result. + // However, sometimes we have to break ties. For example we may have to + // choose between matching loads vs matching opcodes. This is what these + // scores are helping us with: they provide the order of preference. + + /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). + static const int ScoreConsecutiveLoads = 3; + /// Constants. + static const int ScoreConstants = 2; + /// Instructions with the same opcode. + static const int ScoreSameOpcode = 2; + /// Instructions with alt opcodes (e.g, add + sub). + static const int ScoreAltOpcodes = 1; + /// Identical instructions (a.k.a. splat or broadcast). + static const int ScoreSplat = 1; + /// Matching with an undef is preferable to failing. + static const int ScoreUndef = 1; + /// Score for failing to find a decent match. + static const int ScoreFail = 0; + /// User external to the vectorized code. + static const int ExternalUseCost = 1; + /// The user is internal but in a different lane. + static const int UserInDiffLaneCost = ExternalUseCost; + + /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. + static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, + ScalarEvolution &SE) { + auto *LI1 = dyn_cast<LoadInst>(V1); + auto *LI2 = dyn_cast<LoadInst>(V2); + if (LI1 && LI2) + return isConsecutiveAccess(LI1, LI2, DL, SE) + ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreFail; + + auto *C1 = dyn_cast<Constant>(V1); + auto *C2 = dyn_cast<Constant>(V2); + if (C1 && C2) + return VLOperands::ScoreConstants; + + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (I1 && I2) { + if (I1 == I2) + return VLOperands::ScoreSplat; + InstructionsState S = getSameOpcode({I1, I2}); + // Note: Only consider instructions with <= 2 operands to avoid + // complexity explosion. + if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) + return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes + : VLOperands::ScoreSameOpcode; + } + + if (isa<UndefValue>(V2)) + return VLOperands::ScoreUndef; + + return VLOperands::ScoreFail; + } + + /// Holds the values and their lane that are taking part in the look-ahead + /// score calculation. This is used in the external uses cost calculation. + SmallDenseMap<Value *, int> InLookAheadValues; + + /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are + /// either external to the vectorized code, or require shuffling. + int getExternalUsesCost(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS) { + int Cost = 0; + SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS}; + for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { + Value *V = Values[Idx].first; + // Calculate the absolute lane, using the minimum relative lane of LHS + // and RHS as base and Idx as the offset. + int Ln = std::min(LHS.second, RHS.second) + Idx; + assert(Ln >= 0 && "Bad lane calculation"); + for (User *U : V->users()) { + if (const TreeEntry *UserTE = R.getTreeEntry(U)) { + // The user is in the VectorizableTree. Check if we need to insert. + auto It = llvm::find(UserTE->Scalars, U); + assert(It != UserTE->Scalars.end() && "U is in UserTE"); + int UserLn = std::distance(UserTE->Scalars.begin(), It); + assert(UserLn >= 0 && "Bad lane"); + if (UserLn != Ln) + Cost += UserInDiffLaneCost; + } else { + // Check if the user is in the look-ahead code. + auto It2 = InLookAheadValues.find(U); + if (It2 != InLookAheadValues.end()) { + // The user is in the look-ahead code. Check the lane. + if (It2->second != Ln) + Cost += UserInDiffLaneCost; + } else { + // The user is neither in SLP tree nor in the look-ahead code. + Cost += ExternalUseCost; + } + } + } + } + return Cost; + } + + /// Go through the operands of \p LHS and \p RHS recursively until \p + /// MaxLevel, and return the cummulative score. For example: + /// \verbatim + /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] + /// \ / \ / \ / \ / + /// + + + + + /// G1 G2 G3 G4 + /// \endverbatim + /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at + /// each level recursively, accumulating the score. It starts from matching + /// the additions at level 0, then moves on to the loads (level 1). The + /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and + /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// Please note that the order of the operands does not matter, as we + /// evaluate the score of all profitable combinations of operands. In + /// other words the score of G1 and G4 is the same as G1 and G2. This + /// heuristic is based on ideas described in: + /// Look-ahead SLP: Auto-vectorization in the presence of commutative + /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, + /// Luís F. W. Góes + int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS, int CurrLevel, + int MaxLevel) { + + Value *V1 = LHS.first; + Value *V2 = RHS.first; + // Get the shallow score of V1 and V2. + int ShallowScoreAtThisLevel = + std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - + getExternalUsesCost(LHS, RHS)); + int Lane1 = LHS.second; + int Lane2 = RHS.second; + + // If reached MaxLevel, + // or if V1 and V2 are not instructions, + // or if they are SPLAT, + // or if they are not consecutive, early return the current cost. + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || + ShallowScoreAtThisLevel == VLOperands::ScoreFail || + (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + return ShallowScoreAtThisLevel; + assert(I1 && I2 && "Should have early exited."); + + // Keep track of in-tree values for determining the external-use cost. + InLookAheadValues[V1] = Lane1; + InLookAheadValues[V2] = Lane2; + + // Contains the I2 operand indexes that got matched with I1 operands. + SmallSet<int, 4> Op2Used; + + // Recursion towards the operands of I1 and I2. We are trying all possbile + // operand pairs, and keeping track of the best score. + for (int OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); + OpIdx1 != NumOperands1; ++OpIdx1) { + // Try to pair op1I with the best operand of I2. + int MaxTmpScore = 0; + int MaxOpIdx2 = -1; + // If I2 is commutative try all combinations. + int FromIdx = isCommutative(I2) ? 0 : OpIdx1; + int ToIdx = isCommutative(I2) ? I2->getNumOperands() : OpIdx1 + 1; + assert(FromIdx < ToIdx && "Bad index"); + for (int OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { + // Skip operands already paired with OpIdx1. + if (Op2Used.count(OpIdx2)) + continue; + // Recursively calculate the cost at each level + int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, + {I2->getOperand(OpIdx2), Lane2}, + CurrLevel + 1, MaxLevel); + // Look for the best score. + if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + MaxTmpScore = TmpScore; + MaxOpIdx2 = OpIdx2; + } + } + if (MaxOpIdx2 >= 0) { + // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. + Op2Used.insert(MaxOpIdx2); + ShallowScoreAtThisLevel += MaxTmpScore; + } + } + return ShallowScoreAtThisLevel; + } + + /// \Returns the look-ahead score, which tells us how much the sub-trees + /// rooted at \p LHS and \p RHS match, the more they match the higher the + /// score. This helps break ties in an informed way when we cannot decide on + /// the order of the operands by just considering the immediate + /// predecessors. + int getLookAheadScore(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS) { + InLookAheadValues.clear(); + return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); + } + // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. // If no good match can be found, return None. @@ -750,9 +958,6 @@ public: // The linearized opcode of the operand at OpIdx, Lane. bool OpIdxAPO = getData(OpIdx, Lane).APO; - const unsigned BestScore = 2; - const unsigned GoodScore = 1; - // The best operand index and its score. // Sometimes we have more than one option (e.g., Opcode and Undefs), so we // are using the score to differentiate between the two. @@ -781,41 +986,19 @@ public: // Look for an operand that matches the current mode. switch (RMode) { case ReorderingMode::Load: - if (isa<LoadInst>(Op)) { - // Figure out which is left and right, so that we can check for - // consecutive loads - bool LeftToRight = Lane > LastLane; - Value *OpLeft = (LeftToRight) ? OpLastLane : Op; - Value *OpRight = (LeftToRight) ? Op : OpLastLane; - if (isConsecutiveAccess(cast<LoadInst>(OpLeft), - cast<LoadInst>(OpRight), DL, SE)) - BestOp.Idx = Idx; - } - break; - case ReorderingMode::Opcode: - // We accept both Instructions and Undefs, but with different scores. - if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && - cast<Instruction>(Op)->getOpcode() == - cast<Instruction>(OpLastLane)->getOpcode()) || - (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || - isa<UndefValue>(Op)) { - // An instruction has a higher score than an undef. - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } - } - break; case ReorderingMode::Constant: - if (isa<Constant>(Op)) { - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } + case ReorderingMode::Opcode: { + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + unsigned Score = + getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; } break; + } case ReorderingMode::Splat: if (Op == OpLastLane) BestOp.Idx = Idx; @@ -946,8 +1129,8 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, - ScalarEvolution &SE) - : DL(DL), SE(SE) { + ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1169,7 +1352,8 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE); + ScalarEvolution &SE, + const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -2371,7 +2555,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -2415,7 +2599,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // have the same opcode. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -2584,7 +2768,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -3299,13 +3483,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); |