3 files changed, 284 insertions, 25 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 44b69c16e2b..aa683d0b3d0 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15,22 +15,24 @@ // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Vectorize.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" @@ -45,7 +47,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Analysis/VectorUtils.h" +#include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <map> #include <memory> @@ -364,9 +366,9 @@ public: BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, - DominatorTree *Dt, AssumptionCache *AC) + DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), - SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), + SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); } @@ -400,6 +402,7 @@ public: BlockScheduling *BS = Iter.second.get(); BS->clear(); } + MinBWs.clear(); } /// \brief Perform LICM and CSE on the newly generated gather sequences. @@ -417,6 +420,10 @@ public: /// vectorization factors. unsigned getVectorElementSize(Value *V); + /// Compute the minimum type sizes required to represent the entries in a + /// vectorizable tree. + void computeMinimumValueSizes(); + private: struct TreeEntry; @@ -914,8 +921,14 @@ private: AliasAnalysis *AA; LoopInfo *LI; DominatorTree *DT; + AssumptionCache *AC; + DemandedBits *DB; /// Instruction builder to construct the vectorized tree. IRBuilder<> Builder; + + /// A map of scalar integer values to the smallest bit width with which they + /// can legally be represented. + MapVector<Value *, uint64_t> MinBWs; }; #ifndef NDEBUG @@ -1471,6 +1484,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. 
+ if (MinBWs.count(VL[0])) + VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]), + VL.size()); + if (E->NeedToGather) { if (allConstant(VL)) return 0; @@ -1798,9 +1817,19 @@ int BoUpSLP::getTreeCost() { if (EphValues.count(EU.User)) continue; - VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - EU.Lane); + // If we plan to rewrite the tree in a smaller type, we will need to sign + // extend the extracted value back to the original type. Here, we account + // for the extract and the added cost of the sign extend if needed. + auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); + auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + if (MinBWs.count(ScalarRoot)) { + auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]); + VecTy = VectorType::get(MinTy, BundleWidth); + ExtractCost += + TTI->getCastInstrCost(Instruction::SExt, EU.Scalar->getType(), MinTy); + } + ExtractCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane); } int SpillCost = getSpillCost(); @@ -2501,7 +2530,21 @@ Value *BoUpSLP::vectorizeTree() { } Builder.SetInsertPoint(&F->getEntryBlock().front()); - vectorizeTree(&VectorizableTree[0]); + auto *VectorRoot = vectorizeTree(&VectorizableTree[0]); + + // If the vectorized tree can be rewritten in a smaller type, we truncate the + // vectorized root. InstCombine will then rewrite the entire expression. We + // sign extend the extracted values below. + auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + if (MinBWs.count(ScalarRoot)) { + if (auto *I = dyn_cast<Instruction>(VectorRoot)) + Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); + auto BundleWidth = VectorizableTree[0].Scalars.size(); + auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]); + auto *VecTy = VectorType::get(MinTy, BundleWidth); + auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy); + VectorizableTree[0].VectorizedValue = Trunc; + } DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); @@ -2534,6 +2577,8 @@ Value *BoUpSLP::vectorizeTree() { if (PH->getIncomingValue(i) == Scalar) { Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); + if (MinBWs.count(ScalarRoot)) + Ex = Builder.CreateSExt(Ex, Scalar->getType()); CSEBlocks.insert(PH->getIncomingBlock(i)); PH->setOperand(i, Ex); } @@ -2541,12 +2586,16 @@ Value *BoUpSLP::vectorizeTree() { } else { Builder.SetInsertPoint(cast<Instruction>(User)); Value *Ex = Builder.CreateExtractElement(Vec, Lane); + if (MinBWs.count(ScalarRoot)) + Ex = Builder.CreateSExt(Ex, Scalar->getType()); CSEBlocks.insert(cast<Instruction>(User)->getParent()); User->replaceUsesOfWith(Scalar, Ex); } } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); Value *Ex = Builder.CreateExtractElement(Vec, Lane); + if (MinBWs.count(ScalarRoot)) + Ex = Builder.CreateSExt(Ex, Scalar->getType()); CSEBlocks.insert(&F->getEntryBlock()); User->replaceUsesOfWith(Scalar, Ex); } @@ -3115,7 +3164,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. else if (isa<LoadInst>(I)) - MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty)); + MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty)); // Otherwise, we need to visit the operands of the instruction. 
We only
  // handle the interesting cases from buildTree here. If an operand is an
@@ -3142,6 +3191,176 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
   return MaxWidth;
 }
 
+// Determine if a value V in a vectorizable expression Expr can be demoted to a
+// smaller type with a truncation. We collect the values that will be demoted
+// in ToDemote and additional roots that require investigating in Roots.
+static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
+                                  SmallVectorImpl<Value *> &ToDemote,
+                                  SmallVectorImpl<Value *> &Roots) {
+
+  // We can always demote constants.
+  if (isa<Constant>(V)) {
+    ToDemote.push_back(V);
+    return true;
+  }
+
+  // If the value is not an instruction in the expression with only one use, it
+  // cannot be demoted.
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I || !I->hasOneUse() || !Expr.count(I))
+    return false;
+
+  switch (I->getOpcode()) {
+
+  // We can always demote truncations and extensions. Since truncations can
+  // seed additional demotion, we save the truncated value.
+  case Instruction::Trunc:
+    Roots.push_back(I->getOperand(0));
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    break;
+
+  // We can demote certain binary operations if we can demote both of their
+  // operands.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
+        !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
+      return false;
+    break;
+
+  // We can demote selects if we can demote their true and false values.
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
+        !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
+      return false;
+    break;
+  }
+
+  // We can demote phis if we can demote all their incoming operands. Note that
+  // we don't need to worry about cycles since we ensure single use above.
+  case Instruction::PHI: {
+    PHINode *PN = cast<PHINode>(I);
+    for (Value *IncValue : PN->incoming_values())
+      if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+        return false;
+    break;
+  }
+
+  // Otherwise, conservatively give up.
+  default:
+    return false;
+  }
+
+  // Record the value that we can demote.
+  ToDemote.push_back(V);
+  return true;
+}
+
+void BoUpSLP::computeMinimumValueSizes() {
+  auto &DL = F->getParent()->getDataLayout();
+
+  // If there are no external uses, the expression tree must be rooted by a
+  // store. We can't demote in-memory values, so there is nothing to do here.
+  if (ExternalUses.empty())
+    return;
+
+  // We only attempt to truncate integer expressions.
+  auto &TreeRoot = VectorizableTree[0].Scalars;
+  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+  if (!TreeRootIT)
+    return;
+
+  // If the expression is not rooted by a store, these roots should have
+  // external uses. We will rely on InstCombine to rewrite the expression in
+  // the narrower type. However, InstCombine only rewrites single-use values.
+  // This means that if a tree entry other than a root is used externally, it
+  // must have multiple uses and InstCombine will not rewrite it. The code
+  // below ensures that only the roots are used externally.
+  SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
+  for (auto &EU : ExternalUses)
+    if (!Expr.erase(EU.Scalar))
+      return;
+  if (!Expr.empty())
+    return;
+
+  // Collect the scalar values of the vectorizable expression. We will use this
+  // context to determine which values can be demoted. If we see a truncation,
+  // we mark it as seeding another demotion.
+  for (auto &Entry : VectorizableTree)
+    Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
+
+  // Ensure the roots of the vectorizable tree don't form a cycle. They must
+  // have a single external user that is not in the vectorizable tree.
+  for (auto *Root : TreeRoot)
+    if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
+      return;
+
+  // Conservatively determine if we can actually truncate the roots of the
+  // expression. Collect the values that can be demoted in ToDemote and
+  // additional roots that require investigating in Roots.
+  SmallVector<Value *, 32> ToDemote;
+  SmallVector<Value *, 4> Roots;
+  for (auto *Root : TreeRoot)
+    if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+      return;
+
+  // The maximum bit width required to represent all the values that can be
+  // demoted without loss of precision. It would be safe to truncate the roots
+  // of the expression to this width.
+  auto MaxBitWidth = 8u;
+
+  // We first check if all the bits of the roots are demanded. If they're not,
+  // we can truncate the roots to this narrower type.
+  for (auto *Root : TreeRoot) {
+    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
+    MaxBitWidth = std::max<unsigned>(
+        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+  }
+
+  // If all the bits of the roots are demanded, we can try a little harder to
+  // compute a narrower type. This can happen, for example, if the roots are
+  // getelementptr indices. InstCombine promotes these indices to the pointer
+  // width. Thus, all their bits are technically demanded even though the
+  // address computation might be vectorized in a smaller type.
+  //
+  // We start by looking at each entry that can be demoted. We compute the
+  // maximum bit width required to store the scalar by using ValueTracking to
+  // compute the number of high-order bits we can truncate.
+  if (MaxBitWidth == DL.getTypeSizeInBits(TreeRoot[0]->getType())) {
+    MaxBitWidth = 8u;
+    for (auto *Scalar : ToDemote) {
+      auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
+      auto NumTypeBits = DL.getTypeSizeInBits(Scalar->getType());
+      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
+    }
+  }
+
+  // Round MaxBitWidth up to the next power-of-two.
+  if (!isPowerOf2_64(MaxBitWidth))
+    MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+  // If the maximum bit width we compute is less than the width of the roots'
+  // type, we can proceed with the narrowing. Otherwise, do nothing.
+  if (MaxBitWidth >= TreeRootIT->getBitWidth())
+    return;
+
+  // If we can truncate the root, we must collect additional values that might
+  // be demoted as a result. That is, those seeded by truncations we will
+  // modify.
+  while (!Roots.empty())
+    collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+
+  // Finally, map the values we can demote to the maximum bit width we computed.
+  for (auto *Scalar : ToDemote)
+    MinBWs[Scalar] = MaxBitWidth;
+}
+
 /// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass { typedef SmallVector<StoreInst *, 8> StoreList; @@ -3163,6 +3382,7 @@ struct SLPVectorizer : public FunctionPass { LoopInfo *LI; DominatorTree *DT; AssumptionCache *AC; + DemandedBits *DB; bool runOnFunction(Function &F) override { if (skipOptnoneFunction(F)) @@ -3176,6 +3396,7 @@ struct SLPVectorizer : public FunctionPass { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + DB = &getAnalysis<DemandedBits>(); Stores.clear(); GEPs.clear(); @@ -3205,7 +3426,7 @@ struct SLPVectorizer : public FunctionPass { // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC); + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. @@ -3248,6 +3469,7 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<DemandedBits>(); AU.addPreserved<LoopInfoWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<AAResultsWrapperPass>(); @@ -3352,6 +3574,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, ArrayRef<Value *> Operands = Chain.slice(i, VF); R.buildTree(Operands); + R.computeMinimumValueSizes(); int Cost = R.getTreeCost(); @@ -3551,6 +3774,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, Value *ReorderedOps[] = { Ops[1], Ops[0] }; R.buildTree(ReorderedOps, None); } + R.computeMinimumValueSizes(); int Cost = R.getTreeCost(); if (Cost < -SLPCostThreshold) { @@ -3817,6 +4041,7 @@ public: for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps); + V.computeMinimumValueSizes(); // Estimate cost. 
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll index 59ceba1717a..9c06b24163a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll @@ -1,4 +1,5 @@ -; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s +; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=PROFITABLE +; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s | FileCheck %s --check-prefix=UNPROFITABLE target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -18,13 +19,13 @@ target triple = "aarch64--linux-gnu" ; return sum; ; } -; CHECK-LABEL: @gather_reduce_8x16_i32 +; PROFITABLE-LABEL: @gather_reduce_8x16_i32 ; -; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16> -; CHECK: zext <8 x i16> [[L]] to <8 x i32> -; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32> -; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]] -; CHECK: sext i32 [[X]] to i64 +; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16> +; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32> +; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32> +; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]] +; PROFITABLE: sext i32 [[X]] to i64 ; define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) { entry: @@ -137,14 +138,18 @@ for.body: br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body } -; CHECK-LABEL: @gather_reduce_8x16_i64 +; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64 ; -; CHECK-NOT: load <8 x i16> +; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16> +; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32> +; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32> +; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]] +; UNPROFITABLE: sext i32 [[X]] to i64 ; -; FIXME: We are currently unable to vectorize the case with i64 subtraction -; because the zero extensions are too expensive. The solution here is to -; convert the i64 subtractions to i32 subtractions during vectorization. -; This would then match the case above. +; TODO: Although we can now vectorize this case while converting the i64 +; subtractions to i32, the cost model currently finds vectorization to be +; unprofitable. The cost model is penalizing the sign and zero +; extensions in the vectorized version, but they are actually free. ; define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) { entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll index 03175096783..7e1d6709566 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll @@ -24,3 +24,32 @@ for.cond4: for.end11: ret void } + +; This test ensures that we do not regress due to PR26629. We must look at +; every root in the vectorizable tree when computing minimum sizes since one +; root may require fewer bits than another. 
+;
+; CHECK-LABEL: @PR26629
+; CHECK-NOT: {{.*}} and <2 x i72>
+;
+define void @PR26629(i32* %c) {
+entry:
+  br i1 undef, label %for.ph, label %for.end
+
+for.ph:
+  %0 = load i32, i32* %c, align 4
+  br label %for.body
+
+for.body:
+  %d = phi i72 [ 576507472957710340, %for.ph ], [ %bf.set17, %for.body ]
+  %sub = sub i32 %0, undef
+  %bf.clear13 = and i72 %d, -576460748008464384
+  %1 = zext i32 %sub to i72
+  %bf.value15 = and i72 %1, 8191
+  %bf.clear16 = or i72 %bf.value15, %bf.clear13
+  %bf.set17 = or i72 %bf.clear16, undef
+  br label %for.body
+
+for.end:
+  ret void
+}
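
The demotion walk this patch adds in collectValuesToDemote can be illustrated outside of LLVM. The standalone sketch below (plain C++11, compiles with any recent compiler) applies the same rules: constants are always demotable, and every other value must be a single-use trunc/zext/sext, an add/sub/mul/and/or/xor whose operands are themselves demotable. Opcode, Node, and isDemotable are illustrative stand-ins rather than LLVM APIs, and the Expr membership test, the select/phi cases, and the Roots bookkeeping for truncations are omitted to keep the example short.

#include <iostream>
#include <vector>

enum class Opcode { Constant, Trunc, ZExt, SExt, Add, Sub, Mul, And, Or, Xor, Load };

struct Node {
  Opcode Op;
  int NumUses;                        // uses inside the expression
  std::vector<const Node *> Operands;
};

// Returns true if N can be rewritten in a narrower integer type, collecting
// the demotable values in ToDemote in post-order, like the patch does.
static bool isDemotable(const Node *N, std::vector<const Node *> &ToDemote) {
  // Constants can always be rewritten in a narrower type.
  if (N->Op == Opcode::Constant) {
    ToDemote.push_back(N);
    return true;
  }

  // A multiply-used value would force us to rewrite users we may not control.
  if (N->NumUses != 1)
    return false;

  switch (N->Op) {
  case Opcode::Trunc:
  case Opcode::ZExt:
  case Opcode::SExt:
    break; // casts only move bits around
  case Opcode::Add:
  case Opcode::Sub:
  case Opcode::Mul:
  case Opcode::And:
  case Opcode::Or:
  case Opcode::Xor:
    // The low N bits of these operations depend only on the low N bits of
    // their operands, so they are demotable if every operand is.
    for (const Node *Operand : N->Operands)
      if (!isDemotable(Operand, ToDemote))
        return false;
    break;
  default:
    return false; // loads, shifts, divisions, ... are rejected conservatively
  }

  ToDemote.push_back(N);
  return true;
}

int main() {
  Node A{Opcode::ZExt, 1, {}}, B{Opcode::ZExt, 1, {}}, C{Opcode::Constant, 1, {}};
  Node Sub{Opcode::Sub, 1, {&A, &B}};
  Node Mul{Opcode::Mul, 1, {&Sub, &C}};

  std::vector<const Node *> ToDemote;
  bool Demotable = isDemotable(&Mul, ToDemote);
  std::cout << "demotable: " << Demotable
            << ", values collected: " << ToDemote.size() << "\n"; // 1, 5
}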
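The width computation in computeMinimumValueSizes reduces to a small amount of integer arithmetic: take the highest demanded bit of each root (or, when every bit is demanded, fall back to the sign-bit count of each demotable value), never go below eight bits, and round up to a power of two. The sketch below is a self-contained illustration of that arithmetic with hand-picked masks standing in for DemandedBits results; leadingZeros and roundUpToPowerOfTwo are local helpers, not the APInt or MathExtras routines the patch calls.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Count the leading zero bits of a 64-bit demanded-bits mask.
static unsigned leadingZeros(uint64_t Mask) {
  unsigned N = 0;
  for (uint64_t Bit = UINT64_C(1) << 63; Bit != 0 && (Mask & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

// Round up to the next power of two, e.g. 13 -> 16.
static unsigned roundUpToPowerOfTwo(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

int main() {
  // Hypothetical demanded-bits masks for the roots of a vectorizable tree,
  // e.g. values feeding 13-bit bitfield updates as in the PR26629 test.
  std::vector<uint64_t> RootMasks = {0x1FFF, 0x0FFF};

  unsigned MaxBitWidth = 8; // never narrow below i8
  for (uint64_t Mask : RootMasks) {
    unsigned HighestDemandedBit = 64 - leadingZeros(Mask);
    MaxBitWidth = std::max(MaxBitWidth, HighestDemandedBit);
  }
  MaxBitWidth = roundUpToPowerOfTwo(MaxBitWidth);

  // 13 demanded bits round up to 16, so the tree could be rewritten on
  // vectors of i16 regardless of how wide the original scalar type was.
  std::cout << "MaxBitWidth = " << MaxBitWidth << "\n"; // 16
}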
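The truncate-then-sign-extend rewrite in vectorizeTree is value-preserving because every demoted value is known, via DemandedBits or ComputeNumSignBits, to fit in the narrow type, so recomputing it there and sign extending the extracted lane reproduces the wide result. Below is a scalar illustration in the spirit of the gather-reduce tests, with the 32-bit width assumed rather than derived.

#include <cstdint>
#include <iostream>

int main() {
  // Two values loaded as i16 and zero extended, as in the gather-reduce tests.
  uint16_t A = 100, B = 60000;

  // Original scalar code: subtract in 64 bits and use the result as an index.
  int64_t Wide = static_cast<int64_t>(A) - static_cast<int64_t>(B);

  // What the narrowed tree computes: the same subtraction in 32 bits (a
  // difference of two 16-bit values needs at most 17 signed bits), followed
  // by the sign extension the patch emits after each extractelement.
  int32_t Narrow = static_cast<int32_t>(A) - static_cast<int32_t>(B);
  int64_t Extended = static_cast<int64_t>(Narrow);

  std::cout << Wide << " == " << Extended << " -> "
            << (Wide == Extended ? "ok" : "mismatch") << "\n"; // ok
}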