 -rw-r--r--  llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h      |   6
 -rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp             | 265
 -rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll   | 161
 3 files changed, 258 insertions, 174 deletions
diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 4886700774c..d669a8e5b61 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -92,6 +92,12 @@ private:
/// collected in GEPs.
bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
+ /// Try to find a horizontal reduction or otherwise vectorize a chain of
+ /// binary operators.
+ bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
+ slpvectorizer::BoUpSLP &R,
+ TargetTransformInfo *TTI);
+
/// \brief Scan the basic block and look for patterns that are likely to start
/// a vectorization chain.
bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
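The new vectorizeRootInstruction entry point declared above walks operand chains from a root value (the implementation appears in the SLPVectorizer.cpp hunk below). The following is a minimal, LLVM-free sketch of that traversal idea, with every name hypothetical: an explicit stack of (node, operand index) pairs, a visited set, and a depth bound, trying "vectorization" on each node before descending into its operands.

```cpp
#include <cstdio>
#include <functional>
#include <set>
#include <vector>

struct Node {
  int Id;
  std::vector<Node *> Operands;
};

// Walk from Root, attempting TryVectorize on each node once, then visiting
// its operands; the per-entry index mirrors the Level field of the patch's
// WeakVHWithLevel helper.
static bool walkFromRoot(Node *Root, unsigned MaxDepth,
                         const std::function<bool(Node *)> &TryVectorize) {
  std::vector<std::pair<Node *, unsigned>> Stack{{Root, 0}};
  std::set<Node *> Visited{Root};
  bool Changed = false;
  while (!Stack.empty()) {
    auto &[N, NextOp] = Stack.back();
    if (NextOp == 0 && TryVectorize(N)) // try the node itself first
      Changed = true;
    if (NextOp == N->Operands.size()) { // all children processed
      Stack.pop_back();
      continue;
    }
    Node *Child = N->Operands[NextOp++];
    if (Visited.insert(Child).second && Stack.size() < MaxDepth)
      Stack.push_back({Child, 0});
  }
  return Changed;
}

int main() {
  Node A{0, {}}, B{1, {}}, C{2, {&A, &B}}, Root{3, {&C}};
  bool Changed = walkFromRoot(&Root, /*MaxDepth=*/12, [](Node *N) {
    std::printf("visiting node %d\n", N->Id);
    return false; // pretend nothing was vectorized
  });
  std::printf("changed: %d\n", Changed);
  return 0;
}
```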
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9b215480d18..45cfa24f283 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3969,36 +3969,40 @@ bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
+ Value *P = V->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
// Try to vectorize V.
- if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+ if (tryToVectorizePair(Op0, Op1, R))
return true;
- BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
- BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
// Try to skip B.
if (B && B->hasOneUse()) {
- BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (tryToVectorizePair(A, B0, R)) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
return true;
- }
- if (tryToVectorizePair(A, B1, R)) {
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
return true;
- }
}
// Try to skip A.
if (A && A->hasOneUse()) {
- BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (tryToVectorizePair(A0, B, R)) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
return true;
- }
- if (tryToVectorizePair(A1, B, R)) {
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
return true;
- }
}
- return 0;
+ return false;
}
/// \brief Generate a shuffle mask to be used in a reduction tree.
@@ -4449,29 +4453,143 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
+namespace {
+/// Tracks an instruction and its children.
+class WeakVHWithLevel final : public CallbackVH {
+ /// Operand index of the instruction currently being analyzed.
+ unsigned Level = 0;
+ /// Is this the instruction that should be vectorized, or are we now
+ /// processing children (i.e. operands of this instruction) for potential
+ /// vectorization?
+ bool IsInitial = true;
+
+public:
+ explicit WeakVHWithLevel() = default;
+ WeakVHWithLevel(Value *V) : CallbackVH(V){};
+ /// Restart children analysis each time the value is replaced by a new instruction.
+ void allUsesReplacedWith(Value *New) override {
+ setValPtr(New);
+ Level = 0;
+ IsInitial = true;
+ }
+ /// Check if the instruction was not deleted during vectorization.
+ bool isValid() const { return !getValPtr(); }
+ /// Must the instruction itself be vectorized?
+ bool isInitial() const { return IsInitial; }
+ /// Try to vectorize children.
+ void clearInitial() { IsInitial = false; }
+ /// Are all children processed already?
+ bool isFinal() const {
+ assert(getValPtr() &&
+ (isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() >= Level));
+ return getValPtr() &&
+ cast<Instruction>(getValPtr())->getNumOperands() == Level;
+ }
+ /// Get next child operation.
+ Value *nextOperand() {
+ assert(getValPtr() && isa<Instruction>(getValPtr()) &&
+ cast<Instruction>(getValPtr())->getNumOperands() > Level);
+ return cast<Instruction>(getValPtr())->getOperand(Level++);
+ }
+ virtual ~WeakVHWithLevel() = default;
+};
+} // namespace
+
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators BI, then check if it
-/// can be done.
+/// the phi node P with reduction operators Root in a basic block BB, then check
+/// if it can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
- BoUpSLP &R, TargetTransformInfo *TTI,
- unsigned MinRegSize) {
+static bool canBeVectorized(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
if (!ShouldVectorizeHor)
return false;
- HorizontalReduction HorRdx(MinRegSize);
- if (!HorRdx.matchAssociativeReduction(P, BI))
+ if (!Root)
+ return false;
+
+ if (Root->getParent() != BB)
return false;
+ SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
+ SmallSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Value *V = Stack.back();
+ if (!V) {
+ Stack.pop_back();
+ continue;
+ }
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || isa<PHINode>(Inst)) {
+ Stack.pop_back();
+ continue;
+ }
+ if (Stack.back().isInitial()) {
+ Stack.back().clearInitial();
+ if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+ HorizontalReduction HorRdx(R.getMinVecRegSize());
+ if (HorRdx.matchAssociativeReduction(P, BI)) {
+ // If there is a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. Can safely generate oversized
+ // vectors and rely on the backend to split them to legal sizes.
+ HorRdx.ReduxWidth =
+ std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ P = nullptr;
+ continue;
+ }
+ }
+ if (P) {
+ Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (Inst == P)
+ Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ if (!Inst) {
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ P = nullptr;
+ if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+ Res = true;
+ continue;
+ }
+ }
+ if (Stack.back().isFinal()) {
+ Stack.pop_back();
+ continue;
+ }
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- HorRdx.ReduxWidth =
- std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+ if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
+ if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
+ Stack.size() < RecursionMaxDepth)
+ Stack.push_back(NextV);
+ }
+ return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
+ if (!V)
+ return false;
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
- return HorRdx.tryToReduce(R, TTI);
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ return canBeVectorized(P, I, BB, R, TTI,
+ [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+ return tryToVectorize(BI, R);
+ });
}
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4541,67 +4659,42 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (P->getNumIncomingValues() != 2)
return Changed;
- Value *Rdx = getReductionValue(DT, P, BB, LI);
-
- // Check if this is a Binary Operator.
- BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
- if (!BI)
- continue;
-
// Try to match and vectorize a horizontal reduction.
- if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
Changed = true;
it = BB->begin();
e = BB->end();
continue;
}
-
- Value *Inst = BI->getOperand(0);
- if (Inst == P)
- Inst = BI->getOperand(1);
-
- if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
-
continue;
}
- if (ShouldStartVectorizeHorAtStore)
- if (StoreInst *SI = dyn_cast<StoreInst>(it))
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(SI->getValueOperand())) {
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorize(BinOp, R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ShouldStartVectorizeHorAtStore) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize horizontal reductions feeding into a return.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
- if (RI->getNumOperands() != 0)
- if (BinaryOperator *BinOp =
- dyn_cast<BinaryOperator>(RI->getOperand(0))) {
- DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
- if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
- R.getMinVecRegSize()) ||
- tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
- R)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
+ if (RI->getNumOperands() != 0) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
}
+ }
+ }
// Try to vectorize trees that start at compare instructions.
if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
@@ -4614,16 +4707,14 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
}
- for (int i = 0; i < 2; ++i) {
- if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
- if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
- Changed = true;
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- it = BB->begin();
- e = BB->end();
- break;
- }
+ for (int I = 0; I < 2; ++I) {
+ if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
+ Changed = true;
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ it = BB->begin();
+ e = BB->end();
+ break;
}
}
continue;
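For orientation before the updated test expectations, here is an illustrative source-level counterpart (my reconstruction, not taken from the LLVM tree) of the pattern horizontal-list.ll covers: a sum of element-wise products whose final value reaches a store. With this patch the vectorizer can start from a store, return, or compare root and walk the fadd chain, so a kernel like this is expected to be matched as a horizontal reduction when built with fast-math (e.g. -O2 -ffast-math).

```cpp
#include <cstdio>

// Globals named after the test file's @arr, @arr1, and @res.
float arr[20], arr1[20], res;

void foo_like() {
  float sum = 0.0f;
  sum += arr[0] * arr1[0];
  sum += arr[1] * arr1[1];
  sum += arr[2] * arr1[2];
  sum += arr[3] * arr1[3];
  res = sum; // the store is the root the new code starts from
}

int main() {
  for (int I = 0; I < 20; ++I) {
    arr[I] = static_cast<float>(I);
    arr1[I] = static_cast<float>(I + 1);
  }
  foo_like();
  std::printf("res = %f\n", res);
  return 0;
}
```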
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index ce20b45753c..faae28f4fc6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -11,26 +11,25 @@ define float @baz() {
; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
-; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP8]], [[ADD_1]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
-; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP9]], [[ADD_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
-; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]]
-; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD19]]
-; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP8]], [[ADD19_1]]
-; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP9]], [[ADD19_2]]
+; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
+; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
+; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
+; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
; CHECK-NEXT: ret float [[ADD19_3]]
;
@@ -68,40 +67,37 @@ define float @bazz() {
; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]]
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4), align 16
-; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]]
-; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 5), align 4
-; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]]
-; CHECK-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
-; CHECK-NEXT: [[TMP14:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
-; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP16]], [[ADD19_1]]
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP17]], [[ADD19_2]]
+; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
+; CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
; CHECK-NEXT: ret float [[ADD19_3]]
;
@@ -152,24 +148,20 @@ define float @bazzz() {
; CHECK-LABEL: @bazzz(
; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]]
-; CHECK-NEXT: store float [[TMP12]], float* @res, align 4
-; CHECK-NEXT: ret float [[TMP12]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; CHECK-NEXT: store float [[TMP8]], float* @res, align 4
+; CHECK-NEXT: ret float [[TMP8]]
;
entry:
%0 = load i32, i32* @n, align 4
@@ -198,23 +190,19 @@ define i32 @foo() {
; CHECK-LABEL: @foo(
; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]]
-; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP12]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32
; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4
; CHECK-NEXT: ret i32 [[CONV4]]
;
@@ -244,8 +232,7 @@ entry:
define float @bar() {
; CHECK-LABEL: @bar(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0