| author | Matthew Simpson <mssimpso@codeaurora.org> | 2016-12-12 21:11:04 +0000 |
|---|---|---|
| committer | Matthew Simpson <mssimpso@codeaurora.org> | 2016-12-12 21:11:04 +0000 |
| commit | 92ce0230b54de190e3b5f3dfedb0ed043b59a612 (patch) | |
| tree | 03c32cabb42a1243870e29a5cd36327e04b1c0d7 | |
| parent | 035af9b346556edbb116dae63836f8b69c99c57c (diff) | |
[SLP] Fix sign-extends for type-shrinking
This patch ensures that type-shrinking uses the correct minimum bit width.
Previously, when type-shrinking, we always sign-extended values back to their
original width. However, if we are going to sign-extend and the sign bit is
unknown, we have to increase the minimum bit width by one bit so the
sign-extend fills the upper bits correctly. If the sign bit is known to be
zero, we can perform a zero-extend instead. This should fix PR31243.
Reference: https://llvm.org/bugs/show_bug.cgi?id=31243
Differential Revision: https://reviews.llvm.org/D27466
llvm-svn: 289470
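
To make the failure mode concrete, here is a standalone C++ sketch (not part of the patch) that replays the arithmetic with plain casts: a 32-bit value that fits in 8 bits, but whose top bit in the proposed i8 type is set, round-trips incorrectly through truncate-plus-sign-extend, while adding one bit or zero-extending preserves it.

```cpp
// Standalone illustration (not from the patch) of why an unknown sign bit
// forces an extra bit when sign-extending a shrunken value.
#include <cassert>
#include <cstdint>

int main() {
  // An i32 value whose low 8 bits are 0xFF: it fits in 8 bits, but the
  // top bit of the proposed i8 type is set.
  uint32_t Original = 255;

  // Shrink to 8 bits, then sign-extend back to 32 bits: the set sign bit
  // smears into the upper bits and the value becomes 0xFFFFFFFF, not 255.
  int8_t Shrunk8 = static_cast<int8_t>(Original);
  uint32_t SExt8 = static_cast<uint32_t>(static_cast<int32_t>(Shrunk8));
  assert(SExt8 != Original);

  // Adding one bit (9 bits, shown here with the next wider C type) keeps
  // the sign bit of the smaller type zero, so sign-extension is lossless.
  int16_t Shrunk16 = static_cast<int16_t>(Original);
  uint32_t SExt16 = static_cast<uint32_t>(static_cast<int32_t>(Shrunk16));
  assert(SExt16 == Original);

  // Alternatively, if analysis proves the original value is non-negative,
  // a zero-extend of the 8-bit value is already correct.
  uint32_t ZExt8 = static_cast<uint32_t>(static_cast<uint8_t>(Shrunk8));
  assert(ZExt8 == Original);

  return 0;
}
```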
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 77 |
| -rw-r--r-- | llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll | 72 |
2 files changed, 134 insertions, 15 deletions
```diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 867de25841b..bcaa8439cff 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -910,8 +910,11 @@ private:
   IRBuilder<> Builder;
 
   /// A map of scalar integer values to the smallest bit width with which they
-  /// can legally be represented.
-  MapVector<Value *, uint64_t> MinBWs;
+  /// can legally be represented. The values map to (width, signed) pairs,
+  /// where "width" indicates the minimum bit width and "signed" is True if the
+  /// value must be signed-extended, rather than zero-extended, back to its
+  /// original width.
+  MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
 };
 
 } // end namespace llvm
@@ -1572,8 +1575,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
   // If we have computed a smaller type for the expression, update VecTy so
   // that the costs will be accurate.
   if (MinBWs.count(VL[0]))
-    VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
-                            VL.size());
+    VecTy = VectorType::get(
+        IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
 
   if (E->NeedToGather) {
     if (allConstant(VL))
@@ -1929,10 +1932,12 @@ int BoUpSLP::getTreeCost() {
     auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
     auto *ScalarRoot = VectorizableTree[0].Scalars[0];
     if (MinBWs.count(ScalarRoot)) {
-      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+      auto Extend =
+          MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
       VecTy = VectorType::get(MinTy, BundleWidth);
-      ExtractCost += TTI->getExtractWithExtendCost(
-          Instruction::SExt, EU.Scalar->getType(), VecTy, EU.Lane);
+      ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+                                                   VecTy, EU.Lane);
     } else {
       ExtractCost +=
           TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
@@ -2718,7 +2723,7 @@ Value *BoUpSLP::vectorizeTree() {
     if (auto *I = dyn_cast<Instruction>(VectorRoot))
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
     auto BundleWidth = VectorizableTree[0].Scalars.size();
-    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
     auto *VecTy = VectorType::get(MinTy, BundleWidth);
     auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
     VectorizableTree[0].VectorizedValue = Trunc;
@@ -2726,6 +2731,16 @@ Value *BoUpSLP::vectorizeTree() {
 
   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
 
+  // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
+  // specified by ScalarType.
+  auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
+    if (!MinBWs.count(ScalarRoot))
+      return Ex;
+    if (MinBWs[ScalarRoot].second)
+      return Builder.CreateSExt(Ex, ScalarType);
+    return Builder.CreateZExt(Ex, ScalarType);
+  };
+
   // Extract all of the elements with the external uses.
   for (const auto &ExternalUse : ExternalUses) {
     Value *Scalar = ExternalUse.Scalar;
@@ -2760,8 +2775,7 @@ Value *BoUpSLP::vectorizeTree() {
           Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
         }
         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-        if (MinBWs.count(ScalarRoot))
-          Ex = Builder.CreateSExt(Ex, Scalar->getType());
+        Ex = extend(ScalarRoot, Ex, Scalar->getType());
         CSEBlocks.insert(PH->getIncomingBlock(i));
         PH->setOperand(i, Ex);
       }
@@ -2769,16 +2783,14 @@ Value *BoUpSLP::vectorizeTree() {
       } else {
         Builder.SetInsertPoint(cast<Instruction>(User));
         Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-        if (MinBWs.count(ScalarRoot))
-          Ex = Builder.CreateSExt(Ex, Scalar->getType());
+        Ex = extend(ScalarRoot, Ex, Scalar->getType());
         CSEBlocks.insert(cast<Instruction>(User)->getParent());
         User->replaceUsesOfWith(Scalar, Ex);
       }
     } else {
       Builder.SetInsertPoint(&F->getEntryBlock().front());
       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
-      if (MinBWs.count(ScalarRoot))
-        Ex = Builder.CreateSExt(Ex, Scalar->getType());
+      Ex = extend(ScalarRoot, Ex, Scalar->getType());
       CSEBlocks.insert(&F->getEntryBlock());
       User->replaceUsesOfWith(Scalar, Ex);
     }
@@ -3499,6 +3511,11 @@ void BoUpSLP::computeMinimumValueSizes() {
         Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
   }
 
+  // True if the roots can be zero-extended back to their original type, rather
+  // than sign-extended. We know that if the leading bits are not demanded, we
+  // can safely zero-extend. So we initialize IsKnownPositive to True.
+  bool IsKnownPositive = true;
+
   // If all the bits of the roots are demanded, we can try a little harder to
   // compute a narrower type. This can happen, for example, if the roots are
   // getelementptr indices. InstCombine promotes these indices to the pointer
@@ -3510,11 +3527,41 @@ void BoUpSLP::computeMinimumValueSizes() {
   // compute the number of high-order bits we can truncate.
   if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
     MaxBitWidth = 8u;
+
+    // Determine if the sign bit of all the roots is known to be zero. If not,
+    // IsKnownPositive is set to False.
+    IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
+      bool KnownZero = false;
+      bool KnownOne = false;
+      ComputeSignBit(R, KnownZero, KnownOne, *DL);
+      return KnownZero;
+    });
+
+    // Determine the maximum number of bits required to store the scalar
+    // values.
     for (auto *Scalar : ToDemote) {
       auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
       auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
       MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
     }
+
+    // If we can't prove that the sign bit is zero, we must add one to the
+    // maximum bit width to account for the unknown sign bit. This preserves
+    // the existing sign bit so we can safely sign-extend the root back to the
+    // original type. Otherwise, if we know the sign bit is zero, we will
+    // zero-extend the root instead.
+    //
+    // FIXME: This is somewhat suboptimal, as there will be cases where adding
+    //        one to the maximum bit width will yield a larger-than-necessary
+    //        type. In general, we need to add an extra bit only if we can't
+    //        prove that the upper bit of the original type is equal to the
+    //        upper bit of the proposed smaller type. If these two bits are the
+    //        same (either zero or one) we know that sign-extending from the
+    //        smaller type will result in the same value. Here, since we can't
+    //        yet prove this, we are just making the proposed smaller type
+    //        larger to ensure correctness.
+    if (!IsKnownPositive)
+      ++MaxBitWidth;
   }
 
   // Round MaxBitWidth up to the next power-of-two.
@@ -3534,7 +3581,7 @@ void BoUpSLP::computeMinimumValueSizes() {
 
   // Finally, map the values we can demote to the maximum bit with we computed.
   for (auto *Scalar : ToDemote)
-    MinBWs[Scalar] = MaxBitWidth;
+    MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
 }
 
 namespace {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
new file mode 100644
index 00000000000..00af8ff4a2d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -slp-threshold=-6 -slp-vectorizer -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; These tests ensure that we do not regress due to PR31243. Note that we set
+; the SLP threshold to force vectorization even when not profitable.
+
+; CHECK-LABEL: @PR31243_zext
+;
+; When computing minimum sizes, if we can prove the sign bit is zero, we can
+; zero-extend the roots back to their original sizes.
+;
+; CHECK: %[[OR:.+]] = or <2 x i8> {{.*}}, <i8 1, i8 1>
+; CHECK: %[[E0:.+]] = extractelement <2 x i8> %[[OR]], i32 0
+; CHECK: %[[Z0:.+]] = zext i8 %[[E0]] to i64
+; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[Z0]]
+; CHECK: %[[E1:.+]] = extractelement <2 x i8> %[[OR]], i32 1
+; CHECK: %[[Z1:.+]] = zext i8 %[[E1]] to i64
+; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[Z1]]
+;
+define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
+entry:
+  %tmp0 = zext i8 %v0 to i32
+  %tmp1 = zext i8 %v1 to i32
+  %tmp2 = or i32 %tmp0, 1
+  %tmp3 = or i32 %tmp1, 1
+  %tmp4 = getelementptr inbounds i8, i8* %ptr, i32 %tmp2
+  %tmp5 = getelementptr inbounds i8, i8* %ptr, i32 %tmp3
+  %tmp6 = load i8, i8* %tmp4
+  %tmp7 = load i8, i8* %tmp5
+  %tmp8 = add i8 %tmp6, %tmp7
+  ret i8 %tmp8
+}
+
+; CHECK-LABEL: @PR31243_sext
+;
+; When computing minimum sizes, if we cannot prove the sign bit is zero, we
+; have to include one extra bit for signedness since we will sign-extend the
+; roots.
+;
+; FIXME: This test is suboptimal since the compuation can be performed in i8.
+;        In general, we need to add an extra bit to the maximum bit width only
+;        if we can't prove that the upper bit of the original type is equal to
+;        the upper bit of the proposed smaller type. If these two bits are the
+;        same (either zero or one) we know that sign-extending from the smaller
+;        type will result in the same value. Since we don't yet perform this
+;        optimization, we make the proposed smaller type (i8) larger (i16) to
+;        ensure correctness.
+;
+; CHECK: %[[S0:.+]] = sext <2 x i8> {{.*}} to <2 x i16>
+; CHECK: %[[OR:.+]] = or <2 x i16> %[[S0]], <i16 1, i16 1>
+; CHECK: %[[E0:.+]] = extractelement <2 x i16> %[[OR]], i32 0
+; CHECK: %[[S1:.+]] = sext i16 %[[E0]] to i64
+; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[S1]]
+; CHECK: %[[E1:.+]] = extractelement <2 x i16> %[[OR]], i32 1
+; CHECK: %[[S2:.+]] = sext i16 %[[E1]] to i64
+; CHECK: getelementptr inbounds i8, i8* %ptr, i64 %[[S2]]
+;
+define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
+entry:
+  %tmp0 = sext i8 %v0 to i32
+  %tmp1 = sext i8 %v1 to i32
+  %tmp2 = or i32 %tmp0, 1
+  %tmp3 = or i32 %tmp1, 1
+  %tmp4 = getelementptr inbounds i8, i8* %ptr, i32 %tmp2
+  %tmp5 = getelementptr inbounds i8, i8* %ptr, i32 %tmp3
+  %tmp6 = load i8, i8* %tmp4
+  %tmp7 = load i8, i8* %tmp5
+  %tmp8 = add i8 %tmp6, %tmp7
+  ret i8 %tmp8
+}
```
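
The extra bit also interacts with the power-of-two rounding noted in the last SLPVectorizer.cpp hunk, which is why @PR31243_sext settles on <2 x i16> rather than a 9-bit type. Below is a minimal sketch of that interaction; roundUpToPow2 is a local stand-in for the vectorizer's rounding step, not an LLVM API.

```cpp
// Sketch of how the extra sign bit pushes the shrunken type from i8 to i16.
#include <cassert>
#include <cstdint>

// Smallest power of two >= N (a stand-in for the rounding the pass performs).
static uint64_t roundUpToPow2(uint64_t N) {
  uint64_t P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

int main() {
  // @PR31243_zext: the sign bit is known zero, so 8 bits suffice and the
  // roots vectorize as <2 x i8>.
  unsigned ZextWidth = 8;
  assert(roundUpToPow2(ZextWidth) == 8);

  // @PR31243_sext: the sign bit is unknown, so one bit is added (8 -> 9),
  // and rounding to a power of two yields the <2 x i16> seen in the test.
  unsigned SextWidth = 8 + 1;
  assert(roundUpToPow2(SextWidth) == 16);
  return 0;
}
```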

