From bdac9f30c0bf0251e59f84a101230dd61ebfcf16 Mon Sep 17 00:00:00 2001 From: Mohammad Shahid Date: Fri, 3 Mar 2017 10:02:47 +0000 Subject: [SLP] Fixes the bug due to absence of in order uses of scalars which needs to be available for VectorizeTree() API.This API uses it for proper mask computation to be used in shufflevector IR. The fix is to compute the mask for out of order memory accesses while building the vectorizable tree instead of actual vectorization of vectorizable tree.It also needs to recompute the proper Lane for external use of vectorizable scalars based on shuffle mask. Reviewers: mkuper Differential Revision: https://reviews.llvm.org/D30159 Change-Id: Ide8773ce0ad3562f3cf4d1a0ad0f487e2f60ce5d llvm-svn: 296863 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 151 +++++++++++++----------- 1 file changed, 79 insertions(+), 72 deletions(-) (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp') diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index edaab89ae46..f056679ae89 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -423,10 +423,8 @@ private: /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). bool canReuseExtract(ArrayRef VL, unsigned Opcode) const; - /// Vectorize a single entry in the tree. VL icontains all isomorphic scalars - /// in order of its usage in a user program, for example ADD1, ADD2 and so on - /// or LOAD1 , LOAD2 etc. - Value *vectorizeTree(ArrayRef VL, TreeEntry *E); + /// Vectorize a single entry in the tree. + Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef VL); @@ -466,8 +464,8 @@ private: SmallVectorImpl &Left, SmallVectorImpl &Right); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(nullptr), - NeedToGather(0), NeedToShuffle(0) {} + TreeEntry() + : Scalars(), VectorizedValue(nullptr), NeedToGather(0), ShuffleMask() {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { @@ -495,19 +493,23 @@ private: /// Do we need to gather this sequence ? bool NeedToGather; - /// Do we need to shuffle the load ? - bool NeedToShuffle; + /// Records optional suffle mask for jumbled memory accesses in this. + SmallVector ShuffleMask; + }; /// Create a new VectorizableTree entry. TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, - bool NeedToShuffle) { + ArrayRef ShuffleMask = None) { VectorizableTree.emplace_back(); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; - Last->NeedToShuffle = NeedToShuffle; + if (!ShuffleMask.empty()) + Last->ShuffleMask.insert(Last->ShuffleMask.begin(), ShuffleMask.begin(), + ShuffleMask.end()); + if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); @@ -1030,21 +1032,21 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } // Don't handle vectors. if (VL[0]->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } if (StoreInst *SI = dyn_cast(VL[0])) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } unsigned Opcode = getSameOpcode(VL); @@ -1061,7 +1063,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } @@ -1073,7 +1075,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1086,7 +1088,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1099,7 +1101,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (ScalarToTreeEntry.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1109,7 +1111,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1123,7 +1125,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } @@ -1132,7 +1134,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned j = i+1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } @@ -1147,7 +1149,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1164,12 +1166,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1191,7 +1193,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } else { BS.cancelScheduling(VL); } - newTreeEntry(VL, Reuse, false); + newTreeEntry(VL, Reuse); return; } case Instruction::Load: { @@ -1207,7 +1209,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1218,7 +1220,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1238,7 +1240,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1255,7 +1257,8 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (VL.size() > 2 && !ReverseConsecutive) { bool ShuffledLoads = true; SmallVector Sorted; - if (sortMemAccesses(VL, *DL, *SE, Sorted)) { + SmallVector Mask; + if (sortMemAccesses(VL, *DL, *SE, Sorted, &Mask)) { auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end()); for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) { if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) { @@ -1264,14 +1267,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } } if (ShuffledLoads) { - newTreeEntry(NewVL, true, true); + newTreeEntry(NewVL, true, makeArrayRef(Mask.begin(), Mask.end())); return; } } } BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1298,12 +1301,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Type *Ty = cast(Val)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1326,13 +1329,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1364,7 +1367,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1393,7 +1396,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (cast(Val)->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1406,7 +1409,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } @@ -1418,12 +1421,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1440,12 +1443,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1463,7 +1466,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1477,7 +1480,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1488,7 +1491,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1501,14 +1504,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1525,11 +1528,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // then do not vectorize this instruction. if (!isAltShuffle) { BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, false); + newTreeEntry(VL, true); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1553,7 +1556,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } default: BS.cancelScheduling(VL); - newTreeEntry(VL, false, false); + newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1792,7 +1795,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0); - if (E->NeedToShuffle) { + if (!E->ShuffleMask.empty()) { VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0); } @@ -2358,8 +2361,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { if (ScalarToTreeEntry.count(VL[0])) { int Idx = ScalarToTreeEntry[VL[0]]; TreeEntry *E = &VectorizableTree[Idx]; - if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE))) - return vectorizeTree(VL, E); + if (E->isSame(VL) || + (!E->ShuffleMask.empty() && E->isFoundJumbled(VL, *DL, *SE))) + return vectorizeTree(E); } Type *ScalarTy = VL[0]->getType(); @@ -2370,10 +2374,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { return Gather(VL, VecTy); } -Value *BoUpSLP::vectorizeTree(ArrayRef VL, TreeEntry *E) { +Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->VectorizedValue && !E->NeedToShuffle) { + if (E->VectorizedValue && E->ShuffleMask.empty()) { DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } @@ -2611,27 +2615,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL, TreeEntry *E) { // As program order of scalar loads are jumbled, the vectorized 'load' // must be followed by a 'shuffle' with the required jumbled mask. - if (!VL.empty() && (E->NeedToShuffle)) { - assert(VL.size() == E->Scalars.size() && - "Equal number of scalars expected"); + if (!E->ShuffleMask.empty()) { SmallVector Mask; - for (Value *Val : VL) { - if (ScalarToTreeEntry.count(Val)) { - int Idx = ScalarToTreeEntry[Val]; - TreeEntry *E = &VectorizableTree[Idx]; - for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { - if (E->Scalars[Lane] == Val) { - Mask.push_back(Builder.getInt32(Lane)); - break; - } - } - } + for (unsigned Lane = 0, LE = E->ShuffleMask.size(); Lane != LE; + ++Lane) { + Mask.push_back(Builder.getInt32(E->ShuffleMask[Lane])); } - // Generate shuffle for jumbled memory access Value *Undef = UndefValue::get(VecTy); Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef, ConstantVector::get(Mask)); + E->VectorizedValue = Shuf; + ++NumVectorInstructions; return Shuf; } @@ -2816,7 +2811,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(ArrayRef(), &VectorizableTree[0]); + auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We @@ -2861,8 +2856,20 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); - - Value *Lane = Builder.getInt32(ExternalUse.Lane); + unsigned i = 0; + Value *Lane; + // In case vectorizable scalars use are not in-order, scalars would have + // been shuffled.Recompute the proper Lane of ExternalUse. + if (!E->ShuffleMask.empty()) { + SmallVector Val(E->ShuffleMask.size()); + for (; i < E->ShuffleMask.size(); i++) { + if (E->ShuffleMask[i] == (unsigned)ExternalUse.Lane) + break; + } + Lane = Builder.getInt32(i); + } else { + Lane = Builder.getInt32(ExternalUse.Lane); + } // If User == nullptr, the Scalar is used as extra arg. Generate // ExtractElement instruction and update the record for this scalar in // ExternallyUsedValues. -- cgit v1.2.3