diff options
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 258 |
1 files changed, 41 insertions, 217 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5b0b2e0cf19..f69a4e52c7e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -412,13 +412,6 @@ public: return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; } - /// \return The vector element size in bits to use when vectorizing the - /// expression tree ending at \p V. If V is a store, the size is the width of - /// the stored value. Otherwise, the size is the width of the largest loaded - /// value reaching V. This method is used by the vectorizer to calculate - /// vectorization factors. - unsigned getVectorElementSize(Value *V); - private: struct TreeEntry; @@ -3146,73 +3139,10 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { BS->ScheduleStart = nullptr; } -unsigned BoUpSLP::getVectorElementSize(Value *V) { - auto &DL = F->getParent()->getDataLayout(); - - // If V is a store, just return the width of the stored value without - // traversing the expression tree. This is the common case. - if (auto *Store = dyn_cast<StoreInst>(V)) - return DL.getTypeSizeInBits(Store->getValueOperand()->getType()); - - // If V is not a store, we can traverse the expression tree to find loads - // that feed it. The type of the loaded value may indicate a more suitable - // width than V's type. We want to base the vector element size on the width - // of memory operations where possible. - SmallVector<Instruction *, 16> Worklist; - SmallPtrSet<Instruction *, 16> Visited; - if (auto *I = dyn_cast<Instruction>(V)) - Worklist.push_back(I); - - // Traverse the expression tree in bottom-up order looking for loads. If we - // encounter an instruciton we don't yet handle, we give up. - auto MaxWidth = 0u; - auto FoundUnknownInst = false; - while (!Worklist.empty() && !FoundUnknownInst) { - auto *I = Worklist.pop_back_val(); - Visited.insert(I); - - // We should only be looking at scalar instructions here. If the current - // instruction has a vector type, give up. - auto *Ty = I->getType(); - if (isa<VectorType>(Ty)) - FoundUnknownInst = true; - - // If the current instruction is a load, update MaxWidth to reflect the - // width of the loaded value. - else if (isa<LoadInst>(I)) - MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty)); - - // Otherwise, we need to visit the operands of the instruction. We only - // handle the interesting cases from buildTree here. If an operand is an - // instruction we haven't yet visited, we add it to the worklist. - else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) || - isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) { - for (Use &U : I->operands()) - if (auto *J = dyn_cast<Instruction>(U.get())) - if (!Visited.count(J)) - Worklist.push_back(J); - } - - // If we don't yet handle the instruction, give up. - else - FoundUnknownInst = true; - } - - // If we didn't encounter a memory access in the expression tree, or if we - // gave up for some reason, just return the width of V. - if (!MaxWidth || FoundUnknownInst) - return DL.getTypeSizeInBits(V->getType()); - - // Otherwise, return the maximum width we found. - return MaxWidth; -} - /// The SLPVectorizer Pass. struct SLPVectorizer : public FunctionPass { typedef SmallVector<StoreInst *, 8> StoreList; typedef MapVector<Value *, StoreList> StoreListMap; - typedef SmallVector<WeakVH, 8> WeakVHList; - typedef MapVector<Value *, WeakVHList> WeakVHListMap; /// Pass identification, replacement for typeid static char ID; @@ -3242,8 +3172,7 @@ struct SLPVectorizer : public FunctionPass { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - Stores.clear(); - GEPs.clear(); + StoreRefs.clear(); bool Changed = false; // If the target claims to have no vector registers don't attempt @@ -3277,24 +3206,15 @@ struct SLPVectorizer : public FunctionPass { // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { - collectSeedInstructions(BB); - // Vectorize trees that end at stores. - if (NumStores > 0) { - DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n"); + if (unsigned count = collectStores(BB, R)) { + (void)count; + DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n"); Changed |= vectorizeStoreChains(R); } // Vectorize trees that end at reductions. Changed |= vectorizeChainsInBlock(BB, R); - - // Vectorize the index computations of getelementptr instructions. This - // is primarily intended to catch gather-like idioms ending at - // non-consecutive loads. - if (NumGEPs > 0) { - DEBUG(dbgs() << "SLP: Found " << NumGEPs << " GEPs.\n"); - Changed |= vectorizeGEPIndices(BB, R); - } } if (Changed) { @@ -3321,14 +3241,12 @@ struct SLPVectorizer : public FunctionPass { } private: - /// \brief Collect store and getelementptr instructions and organize them - /// according to the underlying object of their pointer operands. We sort the - /// instructions by their underlying objects to reduce the cost of - /// consecutive access queries. - /// - /// TODO: We can further reduce this cost if we flush the chain creation - /// every time we run into a memory barrier. - void collectSeedInstructions(BasicBlock *BB); + + /// \brief Collect memory references and sort them according to their base + /// object. We sort the stores to their base objects to reduce the cost of the + /// quadratic search on the stores. TODO: We can further reduce this cost + /// if we flush the chain creation every time we run into a memory barrier. + unsigned collectStores(BasicBlock *BB, BoUpSLP &R); /// \brief Try to vectorize a chain that starts at two arithmetic instrs. bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); @@ -3344,13 +3262,9 @@ private: /// \brief Try to vectorize a chain that may start at the operands of \V; bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); - /// \brief Vectorize the store instructions collected in Stores. + /// \brief Vectorize the stores that were collected in StoreRefs. bool vectorizeStoreChains(BoUpSLP &R); - /// \brief Vectorize the index computations of the getelementptr instructions - /// collected in GEPs. - bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R); - /// \brief Scan the basic block and look for patterns that are likely to start /// a vectorization chain. bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); @@ -3360,19 +3274,8 @@ private: bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, BoUpSLP &R); - - /// The store instructions in a basic block organized by base pointer. - StoreListMap Stores; - - /// The getelementptr instructions in a basic block organized by base pointer. - WeakVHListMap GEPs; - - /// The number of store instructions in a basic block. - unsigned NumStores; - - /// The number of getelementptr instructions in a basic block. - unsigned NumGEPs; - +private: + StoreListMap StoreRefs; unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. }; @@ -3393,7 +3296,9 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, unsigned ChainLen = Chain.size(); DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); - unsigned Sz = R.getVectorElementSize(Chain[0]); + Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); + auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout(); + unsigned Sz = DL.getTypeSizeInBits(StoreTy); unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) @@ -3504,43 +3409,33 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, return Changed; } -void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) { - // Initialize the collections. We will make a single pass over the block. - Stores.clear(); - GEPs.clear(); - NumStores = NumGEPs = 0; +unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { + unsigned count = 0; + StoreRefs.clear(); const DataLayout &DL = BB->getModule()->getDataLayout(); - - // Visit the store and getelementptr instructions in BB and organize them in - // Stores and GEPs according to the underlying objects of their pointer - // operands. for (Instruction &I : *BB) { + StoreInst *SI = dyn_cast<StoreInst>(&I); + if (!SI) + continue; - // Ignore store instructions that are volatile or have a pointer operand - // that doesn't point to a scalar type. - if (auto *SI = dyn_cast<StoreInst>(&I)) { - if (!SI->isSimple()) - continue; - if (!isValidElementType(SI->getValueOperand()->getType())) - continue; - Stores[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI); - ++NumStores; - } + // Don't touch volatile stores. + if (!SI->isSimple()) + continue; - // Ignore getelementptr instructions that have more than one index, a - // constant index, or a pointer operand that doesn't point to a scalar - // type. - else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { - auto Idx = GEP->idx_begin()->get(); - if (GEP->getNumIndices() > 1 || isa<Constant>(Idx)) - continue; - if (!isValidElementType(Idx->getType())) - continue; - GEPs[GetUnderlyingObject(GEP->getPointerOperand(), DL)].push_back(GEP); - ++NumGEPs; - } + // Check that the pointer points to scalars. + Type *Ty = SI->getValueOperand()->getType(); + if (!isValidElementType(Ty)) + continue; + + // Find the base pointer. + Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); + + // Save the store locations. + StoreRefs[Ptr].push_back(SI); + count++; } + return count; } bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { @@ -3564,10 +3459,12 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, return false; unsigned Opcode0 = I0->getOpcode(); + const DataLayout &DL = I0->getModule()->getDataLayout(); + Type *Ty0 = I0->getType(); + unsigned Sz = DL.getTypeSizeInBits(Ty0); // FIXME: Register size should be a parameter to this function, so we can // try different vectorization factors. - unsigned Sz = R.getVectorElementSize(I0); unsigned VF = MinVecRegSize / Sz; for (Value *V : VL) { @@ -4286,83 +4183,10 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return Changed; } -bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { - auto Changed = false; - for (auto &Entry : GEPs) { - auto &GEPList = Entry.second; - - // If the getelementptr list has fewer than two elements, there's nothing - // to do. - if (GEPList.size() < 2) - continue; - - DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " - << GEPList.size() << ".\n"); - - // Initialize a set a candidate getelementptrs. Note that we use a - // SetVector here to preserve program order. If the index computations are - // vectorizable and begin with loads, we want to minimize the chance of - // having to reorder them later. - SetVector<Value *> Candidates(GEPList.begin(), GEPList.end()); - - // Some of the candidates may have already been vectorized after we - // initially collected them. If so, the WeakVHs will have nullified the - // values, so remove them from the set of candidates. - Candidates.remove(nullptr); - - // Remove from the set of candidates all pairs of getelementptrs with - // constant differences. Such getelementptrs are likely not good candidates - // for vectorization in a bottom-up phase since one can be computed from - // the other. - for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { - auto *GEP = SE->getSCEV(GEPList[I]); - for (int J = I + 1; J < E && Candidates.size() > 1; ++J) - if (isa<SCEVConstant>(SE->getMinusSCEV(GEP, SE->getSCEV(GEPList[J])))) { - Candidates.remove(GEPList[I]); - Candidates.remove(GEPList[J]); - } - } - - // We break out of the above computation as soon as we know there are fewer - // than two candidates remaining. - if (Candidates.size() < 2) - continue; - - // Add the single, non-constant index of each candidate to the bundle. We - // ensured the indices met these constraints when we originally collected - // the getelementptrs. - SmallVector<Value *, 16> Bundle(Candidates.size()); - auto BundleIndex = 0u; - for (auto *V : Candidates) { - auto *GEP = cast<GetElementPtrInst>(V); - auto *GEPIdx = GEP->idx_begin()->get(); - assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx)); - Bundle[BundleIndex++] = GEPIdx; - } - - // Try and vectorize the indices. We are currently only interested in - // gather-like cases of the form: - // - // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... - // - // where the loads of "a", the loads of "b", and the subtractions can be - // performed in parallel. It's likely that detecting this pattern in a - // bottom-up phase will be simpler and less costly than building a - // full-blown top-down phase beginning at the consecutive loads. We process - // the bundle in chunks of 16 (like we do for stores) to minimize - // compile-time. - for (unsigned BI = 0, BE = Bundle.size(); BI < BE; BI += 16) { - auto Len = std::min<unsigned>(BE - BI, 16); - Changed |= tryToVectorizeList(makeArrayRef(&Bundle[BI], Len), R); - } - } - return Changed; -} - bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { bool Changed = false; // Attempt to sort and vectorize each of the store-groups. - for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); + for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); it != e; ++it) { if (it->second.size() < 2) continue; |

