diff options
author | Sanjay Patel <spatel@rotateright.com> | 2015-07-08 23:40:55 +0000 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2015-07-08 23:40:55 +0000 |
commit | 13194461958672e09a4b0608186c7793a6aaf241 (patch) | |
tree | 04f05efaf789d66363984fe07b0d4c978689a28c /llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | |
parent | 91e85d4327856aa9bdb0236caeb31c95099afa65 (diff) | |
download | bcm5719-llvm-13194461958672e09a4b0608186c7793a6aaf241.tar.gz bcm5719-llvm-13194461958672e09a4b0608186c7793a6aaf241.zip |
[SLPVectorizer] Try different vectorization factors for store chains
...and set max vector register size based on target
This patch is based on discussion on the llvmdev mailing list:
http://lists.cs.uiuc.edu/pipermail/llvmdev/2015-July/087405.html
and also solves:
https://llvm.org/bugs/show_bug.cgi?id=17170
Several FIXME/TODO items are noted in comments as potential improvements.
Differential Revision: http://reviews.llvm.org/D10950
llvm-svn: 241760
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 44 |
1 files changed, 37 insertions, 7 deletions
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index da35b35248f..7bac407e77e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -69,8 +69,13 @@ static cl::opt<bool> ShouldStartVectorizeHorAtStore( cl::desc( "Attempt to vectorize horizontal reductions feeding into a store")); +static cl::opt<int> +MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, + cl::desc("Attempt to vectorize for this register size in bits")); + namespace { +// FIXME: Set this via cl::opt to allow overriding. static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 12; @@ -3088,6 +3093,17 @@ struct SLPVectorizer : public FunctionPass { if (!TTI->getNumberOfRegisters(true)) return false; + // Use the vector register size specified by the target unless overridden + // by a command-line option. + // TODO: It would be better to limit the vectorization factor based on + // data type rather than just register size. For example, x86 AVX has + // 256-bit registers, but it does not support integer operations + // at that width (that requires AVX2). + if (MaxVectorRegSizeOption.getNumOccurrences()) + MaxVecRegSize = MaxVectorRegSizeOption; + else + MaxVecRegSize = TTI->getRegisterBitWidth(true); + // Don't vectorize when the attribute NoImplicitFloat is used. if (F.hasFnAttribute(Attribute::NoImplicitFloat)) return false; @@ -3165,12 +3181,13 @@ private: bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold, - BoUpSLP &R); + BoUpSLP &R, unsigned VecRegSize); bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold, BoUpSLP &R); private: StoreListMap StoreRefs; + unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. }; /// \brief Check that the Values in the slice in VL array are still existent in @@ -3185,14 +3202,15 @@ static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH, } bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain, - int CostThreshold, BoUpSLP &R) { + int CostThreshold, BoUpSLP &R, + unsigned VecRegSize) { unsigned ChainLen = Chain.size(); DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType(); auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout(); unsigned Sz = DL.getTypeSizeInBits(StoreTy); - unsigned VF = MinVecRegSize / Sz; + unsigned VF = VecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) return false; @@ -3276,10 +3294,15 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores, I = ConsecutiveChain[I]; } - if (vectorizeStoreChain(Operands, costThreshold, R)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed = true; + // FIXME: Is division-by-2 the correct step? Should we assert that the + // register size is a power-of-2? + for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) { + if (vectorizeStoreChain(Operands, costThreshold, R, Size)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed = true; + break; + } } } @@ -3340,6 +3363,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, Type *Ty0 = I0->getType(); unsigned Sz = DL.getTypeSizeInBits(Ty0); + // FIXME: Register size should be a parameter to this function, so we can + // try different vectorization factors. unsigned VF = MinVecRegSize / Sz; for (Value *V : VL) { @@ -3569,6 +3594,8 @@ public: const DataLayout &DL = B->getModule()->getDataLayout(); ReductionOpcode = B->getOpcode(); ReducedValueOpcode = 0; + // FIXME: Register size should be a parameter to this function, so we can + // try different vectorization factors. ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty); ReductionRoot = B; ReductionPHI = Phi; @@ -3995,6 +4022,9 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { << it->second.size() << ".\n"); // Process the stores in chunks of 16. + // TODO: The limit of 16 inhibits greater vectorization factors. + // For example, AVX2 supports v32i8. Increasing this limit, however, + // may cause a significant compile-time increase. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) { unsigned Len = std::min<unsigned>(CE - CI, 16); Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), |