From 3121334d3218f8377b075c3cf7077b00ed12a83d Mon Sep 17 00:00:00 2001
From: Mohammad Shahid
Date: Sat, 28 Jan 2017 17:59:44 +0000
Subject: [SLP] Vectorize loads of consecutive memory accesses, accessed in a non-consecutive (jumbled) way.

The jumbled scalar loads are sorted while building the tree, and such accesses are marked so that a shufflevector with the proper mask is generated after the vectorized load.

Reviewers: hfinkel, mssimpso, mkuper

Differential Revision: https://reviews.llvm.org/D26905

Change-Id: I9c0c8e6f91a00076a7ee1465440a3f6ae092f7ad
llvm-svn: 293386
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 177 ++++++++++++++++--------
 1 file changed, 120 insertions(+), 57 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e27814526ce..eeaf29dceb5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -410,8 +410,10 @@ private: /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). bool canReuseExtract(ArrayRef VL, unsigned Opcode) const; - /// Vectorize a single entry in the tree. - Value *vectorizeTree(TreeEntry *E); + /// Vectorize a single entry in the tree. VL contains all isomorphic scalars + /// in order of their use in the user program, for example ADD1, ADD2 and so on + /// or LOAD1, LOAD2 etc. + Value *vectorizeTree(ArrayRef VL, TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. Value *vectorizeTree(ArrayRef VL); @@ -452,7 +454,7 @@ private: SmallVectorImpl &Right); struct TreeEntry { TreeEntry() : Scalars(), VectorizedValue(nullptr), - NeedToGather(0) {} + NeedToGather(0), NeedToShuffle(0) {} /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { @@ -460,6 +462,15 @@ private: return std::equal(VL.begin(), VL.end(), Scalars.begin()); } + /// \returns true if the scalars in VL are found in this tree entry. + bool isFoundJumbled(ArrayRef VL, const DataLayout &DL, + ScalarEvolution &SE) const { + assert(VL.size() == Scalars.size() && "Invalid size"); + SmallVector List; + sortMemAccesses(VL, DL, SE, List); + return std::equal(List.begin(), List.end(), Scalars.begin()); + } + /// A vector of scalars. ValueList Scalars; @@ -468,15 +479,20 @@ private: /// Do we need to gather this sequence ? bool NeedToGather; + + /// Do we need to shuffle the load ? + bool NeedToShuffle; }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized) { + TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, + bool NeedToShuffle) { VectorizableTree.emplace_back(); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; + Last->NeedToShuffle = NeedToShuffle; if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); @@ -983,21 +999,21 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } // Don't handle vectors.
if (VL[0]->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } if (StoreInst *SI = dyn_cast(VL[0])) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } unsigned Opcode = getSameOpcode(VL); @@ -1014,7 +1030,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } @@ -1026,7 +1042,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1039,7 +1055,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1052,7 +1068,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (ScalarToTreeEntry.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1062,7 +1078,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1076,7 +1092,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } @@ -1085,7 +1101,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned j = i+1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } @@ -1100,7 +1116,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { assert((!BS.getScheduleData(VL[0]) || !BS.getScheduleData(VL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1117,12 +1133,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1144,7 +1160,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } else { BS.cancelScheduling(VL); } - newTreeEntry(VL, Reuse); + newTreeEntry(VL, Reuse, false); return; } case Instruction::Load: { @@ -1160,7 +1176,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1171,15 +1187,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } } // Check if the loads are consecutive, reversed, or neither. - // TODO: What we really want is to sort the loads, but for now, check - // the two likely directions. 
bool Consecutive = true; bool ReverseConsecutive = true; for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { @@ -1193,7 +1207,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1207,8 +1221,25 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { break; } + if (VL.size() > 2 && !ReverseConsecutive) { + bool ShuffledLoads = true; + SmallVector List; + sortMemAccesses(VL, *DL, *SE, List); + auto NewVL = makeArrayRef(List.begin(), List.end()); + for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) { + if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) { + ShuffledLoads = false; + break; + } + } + if (ShuffledLoads) { + newTreeEntry(NewVL, true, true); + return; + } + } + BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1231,16 +1262,16 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast(VL[i])->getOperand(0)->getType(); + for (Value *Val : VL) { + Type *Ty = cast(Val)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1263,13 +1294,13 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1301,7 +1332,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1326,11 +1357,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast(VL[j])->getNumOperands() != 2) { + for (Value *Val : VL) { + if (cast(Val)->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } @@ -1338,29 +1369,29 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // We can't combine several GEPs into one vector if they operate on // different types. 
Type *Ty0 = cast(VL0)->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + for (Value *Val : VL) { + Type *CurTy = cast(Val)->getOperand(0)->getType(); if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast(VL[j])->getOperand(1); + for (Value *Val : VL) { + auto Op = cast(Val)->getOperand(1); if (!isa(Op)) { DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1377,12 +1408,12 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1400,7 +1431,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1414,7 +1445,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1425,7 +1456,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1438,14 +1469,14 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1462,11 +1493,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { // then do not vectorize this instruction. if (!isAltShuffle) { BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, false); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. 
@@ -1490,7 +1521,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { } default: BS.cancelScheduling(VL); - newTreeEntry(VL, false); + newTreeEntry(VL, false, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1723,6 +1754,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0); + if (E->NeedToShuffle) { + VecLdCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0); + } return VecLdCost - ScalarLdCost; } case Instruction::Store: { @@ -2285,8 +2320,8 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { if (ScalarToTreeEntry.count(VL[0])) { int Idx = ScalarToTreeEntry[VL[0]]; TreeEntry *E = &VectorizableTree[Idx]; - if (E->isSame(VL)) - return vectorizeTree(E); + if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE))) + return vectorizeTree(VL, E); } Type *ScalarTy = VL[0]->getType(); @@ -2297,10 +2332,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { return Gather(VL, VecTy); } -Value *BoUpSLP::vectorizeTree(TreeEntry *E) { +Value *BoUpSLP::vectorizeTree(ArrayRef VL, TreeEntry *E) { IRBuilder<>::InsertPointGuard Guard(Builder); - if (E->VectorizedValue) { + if (E->VectorizedValue && !E->NeedToShuffle) { DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); return E->VectorizedValue; } @@ -2534,7 +2569,35 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { LI->setAlignment(Alignment); E->VectorizedValue = LI; ++NumVectorInstructions; - return propagateMetadata(LI, E->Scalars); + propagateMetadata(LI, E->Scalars); + + // As program order of scalar loads are jumbled, the vectorized 'load' + // must be followed by a 'shuffle' with the required jumbled mask. + if (!VL.empty() && (E->NeedToShuffle)) { + assert(VL.size() == E->Scalars.size() && + "Equal number of scalars expected"); + SmallVector Mask; + for (Value *Val : VL) { + if (ScalarToTreeEntry.count(Val)) { + int Idx = ScalarToTreeEntry[Val]; + TreeEntry *E = &VectorizableTree[Idx]; + for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { + if (E->Scalars[Lane] == Val) { + Mask.push_back(Builder.getInt32(Lane)); + break; + } + } + } + } + + // Generate shuffle for jumbled memory access + Value *Undef = UndefValue::get(VecTy); + Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef, + ConstantVector::get(Mask)); + return Shuf; + } + + return LI; } case Instruction::Store: { StoreInst *SI = cast(VL0); @@ -2709,7 +2772,7 @@ Value *BoUpSLP::vectorizeTree() { } Builder.SetInsertPoint(&F->getEntryBlock().front()); - auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); + auto *VectorRoot = vectorizeTree(ArrayRef(), &VectorizableTree[0]); // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We -- cgit v1.2.3
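
As an illustrative sketch only (an assumed example, not taken from this patch or its test cases): suppose four i32 loads cover the consecutive region a[0..3] but appear in the user code in the jumbled order a[1], a[2], a[0], a[3], and their results feed an otherwise vectorizable use chain (e.g. consecutive stores). The scalar input looks roughly like:

  %p0 = getelementptr inbounds i32, i32* %a, i64 0
  %p1 = getelementptr inbounds i32, i32* %a, i64 1
  %p2 = getelementptr inbounds i32, i32* %a, i64 2
  %p3 = getelementptr inbounds i32, i32* %a, i64 3
  ; program order of the loads is jumbled: a[1], a[2], a[0], a[3]
  %l1 = load i32, i32* %p1, align 4
  %l2 = load i32, i32* %p2, align 4
  %l0 = load i32, i32* %p0, align 4
  %l3 = load i32, i32* %p3, align 4

With this change, buildTree_rec sorts the loads by address into a single tree entry with NeedToShuffle set, and vectorizeTree emits one wide load followed by a shuffle that restores the original use order:

  %ptr     = bitcast i32* %p0 to <4 x i32>*
  %wide    = load <4 x i32>, <4 x i32>* %ptr, align 4
  %jumbled = shufflevector <4 x i32> %wide, <4 x i32> undef,
                           <4 x i32> <i32 1, i32 2, i32 0, i32 3>

Each mask element is the lane at which the corresponding scalar load sits in the sorted TreeEntry (the ScalarToTreeEntry/Scalars lookup above), and getEntryCost charges for the extra shuffle via TargetTransformInfo::SK_PermuteSingleSrc.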