summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJames Molloy <james.molloy@arm.com>2015-09-09 12:51:06 +0000
committerJames Molloy <james.molloy@arm.com>2015-09-09 12:51:06 +0000
commit89eccee4db2601d720c7dabf89cb86f916d12e9c (patch)
treefa5c7231902c9857ef51d2f31d711b9b34479225
parent3834d2ca087cd80a106127f4dc4b5ed90b2084dd (diff)
downloadbcm5719-llvm-89eccee4db2601d720c7dabf89cb86f916d12e9c.tar.gz
bcm5719-llvm-89eccee4db2601d720c7dabf89cb86f916d12e9c.zip
Delay predication of stores until near the end of vector code generation
Predicating stores requires creating extra blocks. It's much cleaner if we do this in one pass instead of mutating the CFG while writing vector instructions. Besides which we can make use of helper functions to update domtree for us, reducing the work we need to do. llvm-svn: 247139
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp84
-rw-r--r--llvm/test/Transforms/LoopVectorize/if-pred-stores.ll43
2 files changed, 62 insertions, 65 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index dd9edf442a5..120f094307c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -285,8 +285,6 @@ public:
// Widen each instruction in the old loop to a new one in the new loop.
// Use the Legality module to find the induction and reduction variables.
vectorizeLoop();
- // Register the new loop and update the analysis passes.
- updateAnalysis();
}
// Return true if any runtime check is added.
@@ -490,6 +488,9 @@ protected:
PHINode *OldInduction;
/// Maps scalars to widened vectors.
ValueMap WidenMap;
+ /// Store instructions that should be predicated, as a pair
+ /// <StoreInst, Predicate>
+ SmallVector<std::pair<StoreInst*,Value*>, 4> PredicatedStores;
EdgeMaskCache MaskCache;
/// Trip count of the original loop.
Value *TripCount;
@@ -2472,19 +2473,12 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
- Instruction *InsertPt = Builder.GetInsertPoint();
- BasicBlock *IfBlock = Builder.GetInsertBlock();
- BasicBlock *CondBlock = nullptr;
-
VectorParts Cond;
- Loop *VectorLp = nullptr;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
- VectorLp = LI->getLoopFor(IfBlock);
- assert(VectorLp && "Must have a loop for this block");
}
// For each vector unroll 'part':
@@ -2497,11 +2491,6 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
if (IfPredicateStore) {
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
- CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
- LoopVectorBody.push_back(CondBlock);
- VectorLp->addBasicBlockToLoop(CondBlock, *LI);
- // Update Builder with newly created basic block.
- Builder.SetInsertPoint(InsertPt);
}
Instruction *Cloned = Instr->clone();
@@ -2525,15 +2514,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredic
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
// End if-block.
- if (IfPredicateStore) {
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- LoopVectorBody.push_back(NewIfBlock);
- VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
- Builder.SetInsertPoint(InsertPt);
- ReplaceInstWithInst(IfBlock->getTerminator(),
- BranchInst::Create(CondBlock, NewIfBlock, Cmp));
- IfBlock = NewIfBlock;
- }
+ if (IfPredicateStore)
+ PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
+ Cmp));
}
}
}
@@ -3367,6 +3350,20 @@ void InnerLoopVectorizer::vectorizeLoop() {
fixLCSSAPHIs();
+ // Make sure DomTree is updated.
+ updateAnalysis();
+
+ // Predicate any stores.
+ for (auto KV : PredicatedStores) {
+ BasicBlock::iterator I(KV.first);
+ auto *BB = SplitBlock(I->getParent(), std::next(I), DT, LI);
+ auto *T = SplitBlockAndInsertIfThen(KV.second, I, /*Unreachable=*/false,
+ /*BranchWeights=*/nullptr, DT);
+ I->moveBefore(T);
+ I->getParent()->setName("pred.store.if");
+ BB->setName("pred.store.continue");
+ }
+ DEBUG(DT->verifyDomTree());
// Remove redundant induction instructions.
cse(LoopVectorBody);
}
@@ -3805,17 +3802,10 @@ void InnerLoopVectorizer::updateAnalysis() {
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
- // Due to if predication of stores we might create a sequence of "if(pred)
- // a[i] = ...; " blocks.
- for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
- if (i == 0)
- DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
- else if (isPredicatedBlock(i)) {
- DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
- } else {
- DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
- }
- }
+ // We don't predicate stores by this point, so the vector body should be a
+ // single loop.
+ assert(LoopVectorBody.size() == 1 && "Expected single block loop!");
+ DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
@@ -5412,19 +5402,12 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
- Instruction *InsertPt = Builder.GetInsertPoint();
- BasicBlock *IfBlock = Builder.GetInsertBlock();
- BasicBlock *CondBlock = nullptr;
-
VectorParts Cond;
- Loop *VectorLp = nullptr;
if (IfPredicateStore) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
Instr->getParent());
- VectorLp = LI->getLoopFor(IfBlock);
- assert(VectorLp && "Must have a loop for this block");
}
// For each vector unroll 'part':
@@ -5439,11 +5422,6 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
ConstantInt::get(Cond[Part]->getType(), 1));
- CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
- LoopVectorBody.push_back(CondBlock);
- VectorLp->addBasicBlockToLoop(CondBlock, *LI);
- // Update Builder with newly created basic block.
- Builder.SetInsertPoint(InsertPt);
}
Instruction *Cloned = Instr->clone();
@@ -5463,16 +5441,10 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
if (!IsVoidRetTy)
VecResults[Part] = Cloned;
- // End if-block.
- if (IfPredicateStore) {
- BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
- LoopVectorBody.push_back(NewIfBlock);
- VectorLp->addBasicBlockToLoop(NewIfBlock, *LI);
- Builder.SetInsertPoint(InsertPt);
- ReplaceInstWithInst(IfBlock->getTerminator(),
- BranchInst::Create(CondBlock, NewIfBlock, Cmp));
- IfBlock = NewIfBlock;
- }
+ // End if-block.
+ if (IfPredicateStore)
+ PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned),
+ Cmp));
}
}
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index 991d027ada5..0d70f557f83 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -1,5 +1,8 @@
-; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL
-; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec < %s | FileCheck %s --check-prefix=VEC
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -simplifycfg < %s | FileCheck %s --check-prefix=VEC
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -simplifycfg -instcombine < %s | FileCheck %s --check-prefix=VEC-IC
+
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9.0"
@@ -14,27 +17,49 @@ entry:
; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0
; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true
+; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
;
; VEC: [[cond]]:
-; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
-; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
; VEC: store i32 %[[v13]], i32* %[[v14]], align 4
; VEC: br label %[[else:.+]]
;
; VEC: [[else]]:
; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1
; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true
+; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
;
; VEC: [[cond2]]:
-; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
-; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
; VEC: store i32 %[[v17]], i32* %[[v18]], align 4
; VEC: br label %[[else2:.+]]
;
; VEC: [[else2]]:
+; VEC-IC-LABEL: test
+; VEC-IC: %[[v1:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
+; VEC-IC: %[[v2:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
+; VEC-IC: %[[v3:.+]] = extractelement <2 x i1> %[[v1]], i32 0
+; VEC-IC: br i1 %[[v3]], label %[[cond:.+]], label %[[else:.+]]
+;
+; VEC-IC: [[cond]]:
+; VEC-IC: %[[v4:.+]] = extractelement <2 x i32> %[[v2]], i32 0
+; VEC-IC: store i32 %[[v4]], i32* %{{.*}}, align 4
+; VEC-IC: br label %[[else:.+]]
+;
+; VEC-IC: [[else]]:
+; VEC-IC: %[[v5:.+]] = extractelement <2 x i1> %[[v1]], i32 1
+; VEC-IC: br i1 %[[v5]], label %[[cond2:.+]], label %[[else2:.+]]
+;
+; VEC-IC: [[cond2]]:
+; VEC-IC: %[[v6:.+]] = extractelement <2 x i32> %[[v2]], i32 1
+; VEC-IC: store i32 %[[v6]], i32* %{{.*}}, align 4
+; VEC-IC: br label %[[else2:.+]]
+;
+; VEC-IC: [[else2]]:
+
; UNROLL-LABEL: test
; UNROLL: vector.body:
; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0
@@ -90,9 +115,9 @@ for.end:
; vectorized loop body.
; PR18724
-; UNROLL-LABEL: bug18724
-; UNROLL: store i32
-; UNROLL: store i32
+; UNROLL-NOSIMPLIFY-LABEL: bug18724
+; UNROLL-NOSIMPLIFY: store i32
+; UNROLL-NOSIMPLIFY: store i32
define void @bug18724() {
entry:
OpenPOWER on IntegriCloud