diff options
-rw-r--r-- | llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h | 5 | ||||
-rw-r--r-- | llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 37 | ||||
-rw-r--r-- | llvm/test/Feature/weak_constant.ll | 2 | ||||
-rw-r--r-- | llvm/test/Transforms/SLPVectorizer/X86/reduction.ll | 19 | ||||
-rw-r--r-- | llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll | 19 |
5 files changed, 29 insertions, 53 deletions
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 4004bcf5424..237781dfe22 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -114,12 +114,9 @@ private: /// Try to find horizontal reduction or otherwise vectorize a chain of binary /// operators. - /// \p Try2WayRdx specializes the analysis to only attempt a 2-element - /// reduction. bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, slpvectorizer::BoUpSLP &R, - TargetTransformInfo *TTI, - bool Try2WayRdx = false); + TargetTransformInfo *TTI); /// Try to vectorize trees that start at insertvalue instructions. bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index df40afd88cc..57a17e03583 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6653,7 +6653,7 @@ public: /// Attempt to vectorize the tree found by /// matchAssociativeReduction. - bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI, bool Try2WayRdx) { + bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { if (ReducedVals.empty()) return false; @@ -6661,14 +6661,11 @@ public: // to a nearby power-of-2. Can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); - if (Try2WayRdx && NumReducedVals != 2) - return false; - unsigned MinRdxVals = Try2WayRdx ? 2 : 4; - if (NumReducedVals < MinRdxVals) + if (NumReducedVals < 4) return false; unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); - unsigned MinRdxWidth = Log2_32(MinRdxVals); + Value *VectorizedTree = nullptr; // FIXME: Fast-math-flags should be set based on the instructions in the @@ -6704,7 +6701,7 @@ public: SmallVector<Value *, 16> IgnoreList; for (auto &V : ReductionOps) IgnoreList.append(V.begin(), V.end()); - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) { + while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional<ArrayRef<unsigned>> Order = V.bestOrder(); @@ -7048,7 +7045,7 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P, /// performed. static bool tryToVectorizeHorReductionOrInstOperands( PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, bool Try2WayRdx, + TargetTransformInfo *TTI, const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) { if (!ShouldVectorizeHor) return false; @@ -7079,7 +7076,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( if (BI || SI) { HorizontalReduction HorRdx; if (HorRdx.matchAssociativeReduction(P, Inst)) { - if (HorRdx.tryToReduce(R, TTI, Try2WayRdx)) { + if (HorRdx.tryToReduce(R, TTI)) { Res = true; // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. @@ -7122,8 +7119,7 @@ static bool tryToVectorizeHorReductionOrInstOperands( bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, - bool Try2WayRdx) { + TargetTransformInfo *TTI) { if (!V) return false; auto *I = dyn_cast<Instruction>(V); @@ -7136,7 +7132,7 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { return tryToVectorize(I, R); }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, Try2WayRdx, + return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, ExtraVectorization); } @@ -7332,23 +7328,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { PostProcessInstructions.push_back(&*it); } - // Make a final attempt to match a 2-way reduction if nothing else worked. - // We do not try this above because it may interfere with other vectorization - // attempts. - // TODO: The constraints are copied from the above call to - // vectorizeRootInstruction(), but that might be too restrictive? - BasicBlock::iterator LastInst = --BB->end(); - if (!Changed && LastInst->use_empty() && - (LastInst->getType()->isVoidTy() || isa<CallInst>(LastInst) || - isa<InvokeInst>(LastInst))) { - if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(LastInst)) { - for (auto *V : LastInst->operand_values()) { - Changed |= vectorizeRootInstruction(nullptr, V, BB, R, TTI, - /* Try2WayRdx */ true); - } - } - } - return Changed; } diff --git a/llvm/test/Feature/weak_constant.ll b/llvm/test/Feature/weak_constant.ll index 9a2ea126ebc..4ac2e7e7d68 100644 --- a/llvm/test/Feature/weak_constant.ll +++ b/llvm/test/Feature/weak_constant.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -O3 -S > %t -; RUN: grep undef %t | count 2 +; RUN: grep undef %t | count 1 ; RUN: grep 5 %t | count 1 ; RUN: grep 7 %t | count 1 ; RUN: grep 9 %t | count 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll index 0b7e9ef6c17..9060a1e32d2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll @@ -129,16 +129,15 @@ define i32 @horiz_max_multiple_uses([32 x i32]* %x, i32* %p) { define i1 @bad_insertpoint_rdx([8 x i32]* %p) #0 { ; CHECK-LABEL: @bad_insertpoint_rdx( ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[P:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX22]] to <2 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 undef> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], 0 -; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[SPEC_STORE_SELECT87:%.*]] = zext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[CMP23_2:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT87]], [[OP_EXTRA]] +; CHECK-NEXT: [[T0:%.*]] = load i32, i32* [[ARRAYIDX22]], align 16 +; CHECK-NEXT: [[CMP23:%.*]] = icmp sgt i32 [[T0]], 0 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP23]], i32 [[T0]], i32 0 +; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[P]], i64 0, i64 1 +; CHECK-NEXT: [[T1:%.*]] = load i32, i32* [[ARRAYIDX22_1]], align 4 +; CHECK-NEXT: [[CMP23_1:%.*]] = icmp sgt i32 [[T1]], [[SPEC_SELECT]] +; CHECK-NEXT: [[SPEC_STORE_SELECT87:%.*]] = zext i1 [[CMP23_1]] to i32 +; CHECK-NEXT: [[SPEC_SELECT88:%.*]] = select i1 [[CMP23_1]], i32 [[T1]], i32 [[SPEC_SELECT]] +; CHECK-NEXT: [[CMP23_2:%.*]] = icmp sgt i32 [[SPEC_STORE_SELECT87]], [[SPEC_SELECT88]] ; CHECK-NEXT: ret i1 [[CMP23_2]] ; %arrayidx22 = getelementptr inbounds [8 x i32], [8 x i32]* %p, i64 0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll index fef9a8e50cd..b5f43354927 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -54,10 +54,10 @@ define double @foo(double* nocapture %D) { define i1 @two_wide_fcmp_reduction(<2 x double> %a0) { ; CHECK-LABEL: @two_wide_fcmp_reduction( ; CHECK-NEXT: [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], <double 1.000000e+00, double 1.000000e+00> -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0 +; CHECK-NEXT: [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1 +; CHECK-NEXT: [[D:%.*]] = and i1 [[B]], [[C]] +; CHECK-NEXT: ret i1 [[D]] ; %a = fcmp ogt <2 x double> %a0, <double 1.0, double 1.0> %b = extractelement <2 x i1> %a, i32 0 @@ -96,11 +96,12 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) { ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], <double 0x3EB0C6F7A0B5ED8D, double 0x3EB0C6F7A0B5ED8D> -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> <i32 1, i32 undef> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: ; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], <double 1.000000e+00, double 1.000000e+00> ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 |