-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp             | 307
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll      | 173
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll |  90
-rw-r--r-- | llvm/test/Transforms/LoopVectorize/if-pred-stores.ll        |  31
4 files changed, 501 insertions(+), 100 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 379607263d7..ca87298ce29 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -386,8 +386,9 @@ protected:
/// See PR14725.
void fixLCSSAPHIs();
- /// Predicate conditional stores on their respective conditions.
- void predicateStores();
+ /// Predicate conditional instructions that require predication on their
+ /// respective conditions.
+ void predicateInstructions();
/// Shrinks vector element sizes based on information in "MinBWs".
void truncateToMinimalBitwidths();
@@ -414,11 +415,11 @@ protected:
void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
- /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+ /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each
/// scalarized instruction behind an if block predicated on the control
/// dependence of the instruction.
virtual void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore = false);
+ bool IfPredicateInstr = false);
/// Vectorize Load and Store instructions.
virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -624,7 +625,7 @@ protected:
- /// Store instructions that should be predicated, as a pair
- /// <StoreInst, Predicate>
- SmallVector<std::pair<StoreInst *, Value *>, 4> PredicatedStores;
+ /// Instructions that should be predicated, as a pair
+ /// <Instruction, Predicate>
+ SmallVector<std::pair<Instruction *, Value *>, 4> PredicatedInstructions;
EdgeMaskCache MaskCache;
/// Trip count of the original loop.
Value *TripCount;
@@ -654,7 +655,7 @@ public:
private:
void scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore = false) override;
+ bool IfPredicateInstr = false) override;
void vectorizeMemoryInstruction(Instruction *Instr) override;
Value *getBroadcastInstrs(Value *V) override;
Value *getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -2767,8 +2768,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore) {
+ bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ DEBUG(dbgs() << "LV: Scalarizing"
+ << (IfPredicateInstr ? " and predicating:" : ":") << *Instr
+ << '\n');
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
@@ -2812,7 +2816,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
VectorParts Cond;
- if (IfPredicateStore) {
+ if (IfPredicateInstr) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
@@ -2826,7 +2830,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Start if-block.
Value *Cmp = nullptr;
- if (IfPredicateStore) {
+ if (IfPredicateInstr) {
Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
ConstantInt::get(Cmp->getType(), 1));
@@ -2865,9 +2869,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
// End if-block.
- if (IfPredicateStore)
- PredicatedStores.push_back(
- std::make_pair(cast<StoreInst>(Cloned), Cmp));
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
}
}
}
@@ -3398,9 +3401,13 @@ static Value *addFastMathFlag(Value *V) {
return V;
}
-/// Estimate the overhead of scalarizing a value. Insert and Extract are set if
-/// the result needs to be inserted and/or extracted from vectors.
+/// \brief Estimate the overhead of scalarizing a value based on its type.
+/// Insert and Extract are set if the result needs to be inserted and/or
+/// extracted from vectors.
+/// If the instruction is also to be predicated, add the cost of a PHI
+/// node to the insertion cost.
static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
+ bool Predicated,
const TargetTransformInfo &TTI) {
if (Ty->isVoidTy())
return 0;
@@ -3409,15 +3416,58 @@ static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
unsigned Cost = 0;
for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
- if (Insert)
- Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
if (Extract)
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
+ if (Insert) {
+ Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
+ if (Predicated)
+ Cost += TTI.getCFInstrCost(Instruction::PHI);
+ }
}
+ // We assume that if-converted blocks have a 50% chance of being executed.
+ // Predicated scalarized instructions are avoided due to the control flow
+ // that bypasses turned-off lanes. The extracts and inserts will be sunk or
+ // hoisted into the predicated basic block and are subject to the same
+ // assumption.
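+ // For example (hypothetical unit costs; real values come from TTI), a
+ // predicated <2 x i32> result with unit insert and PHI costs would get an
+ // insertion cost of (1 + 1) * 2 = 4, halved below to 2.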
+ if (Predicated)
+ Cost /= 2;
+
return Cost;
}
+/// \brief Estimate the overhead of scalarizing an Instruction based on the
+/// types of its operands and return value.
+static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
+ Type *RetTy, bool Predicated,
+ const TargetTransformInfo &TTI) {
+ unsigned ScalarizationCost =
+ getScalarizationOverhead(RetTy, true, false, Predicated, TTI);
+
+ for (Type *Ty : OpTys)
+ ScalarizationCost +=
+ getScalarizationOverhead(Ty, false, true, Predicated, TTI);
+
+ return ScalarizationCost;
+}
+
+/// \brief Estimate the overhead of scalarizing an instruction. This is a
+/// convenience wrapper for the type-based getScalarizationOverhead API.
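+/// For example (illustrative only): for 'udiv i32 %a, %b' at VF = 2, both the
+/// return type and the operand types below are widened to <2 x i32>.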
+static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
+ bool Predicated,
+ const TargetTransformInfo &TTI) {
+ if (VF == 1)
+ return 0;
+
+ Type *RetTy = ToVectorTy(I->getType(), VF);
+
+ SmallVector<Type *, 4> OpTys;
+ for (Value *Op : I->operands())
+ OpTys.push_back(ToVectorTy(Op->getType(), VF));
+
+ return getScalarizationOverhead(OpTys, RetTy, Predicated, TTI);
+}
+
// Estimate cost of a call instruction CI if it were vectorized with factor VF.
// Return the cost of the instruction, including scalarization overhead if it's
// needed. The flag NeedToScalarize shows if the call needs to be scalarized -
@@ -3448,10 +3498,7 @@ static unsigned getVectorCallCost(CallInst *CI, unsigned VF,
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- unsigned ScalarizationCost =
- getScalarizationOverhead(RetTy, true, false, TTI);
- for (Type *Ty : Tys)
- ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
+ unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI);
unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
@@ -3871,7 +3918,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
// Make sure DomTree is updated.
updateAnalysis();
- predicateStores();
+ predicateInstructions();
// Remove redundant induction instructions.
cse(LoopVectorBody);
@@ -4038,17 +4085,128 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
LoopMiddleBlock);
}
}
-
-void InnerLoopVectorizer::predicateStores() {
- for (auto KV : PredicatedStores) {
+
+void InnerLoopVectorizer::predicateInstructions() {
+
+ // For each instruction I marked for predication on value C, split I into
+ // its own basic block to form an if-then construct over C. Since I may be
+ // fed by an extractelement and/or feed an insertelement generated during
+ // scalarization, we try to move such instructions into the predicated basic
+ // block as well. For an insertelement this also means that the PHI will be
+ // created for the resulting vector rather than for the scalar instruction.
+ // Consider, for example, the conditional sdiv in:
+ //
+ // for.body:
+ // ...
+ // %add = add nsw i32 %mul, %0
+ // %cmp5 = icmp sgt i32 %2, 7
+ // br i1 %cmp5, label %if.then, label %if.end
+ //
+ // if.then:
+ // %div = sdiv i32 %0, %1
+ // br label %if.end
+ //
+ // if.end:
+ // %x.0 = phi i32 [ %div, %if.then ], [ %add, %for.body ]
+ //
+ // the sdiv at this point is scalarized and if-converted using a select.
+ // The inactive elements in the vector are not used, but the predicated
+ // instruction is still executed for all vector elements, essentially:
+ //
+ // vector.body:
+ // ...
+ // %17 = add nsw <2 x i32> %16, %wide.load
+ // %29 = extractelement <2 x i32> %wide.load, i32 0
+ // %30 = extractelement <2 x i32> %wide.load51, i32 0
+ // %31 = sdiv i32 %29, %30
+ // %32 = insertelement <2 x i32> undef, i32 %31, i32 0
+ // %35 = extractelement <2 x i32> %wide.load, i32 1
+ // %36 = extractelement <2 x i32> %wide.load51, i32 1
+ // %37 = sdiv i32 %35, %36
+ // %38 = insertelement <2 x i32> %32, i32 %37, i32 1
+ // %predphi = select <2 x i1> %26, <2 x i32> %38, <2 x i32> %17
+ //
+ // Predication will now re-introduce the original control flow to avoid
+ // spurious side effects of the sdiv instructions on the inactive elements,
+ // yielding (after cleanup):
+ //
+ // vector.body:
+ // ...
+ // %5 = add nsw <2 x i32> %4, %wide.load
+ // %8 = icmp sgt <2 x i32> %wide.load52, <i32 7, i32 7>
+ // %9 = extractelement <2 x i1> %8, i32 0
+ // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue
+ //
+ // pred.sdiv.if:
+ // %10 = extractelement <2 x i32> %wide.load, i32 0
+ // %11 = extractelement <2 x i32> %wide.load51, i32 0
+ // %12 = sdiv i32 %10, %11
+ // %13 = insertelement <2 x i32> undef, i32 %12, i32 0
+ // br label %pred.sdiv.continue
+ //
+ // pred.sdiv.continue:
+ // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ]
+ // %15 = extractelement <2 x i1> %8, i32 1
+ // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55
+ //
+ // pred.sdiv.if54:
+ // %16 = extractelement <2 x i32> %wide.load, i32 1
+ // %17 = extractelement <2 x i32> %wide.load51, i32 1
+ // %18 = sdiv i32 %16, %17
+ // %19 = insertelement <2 x i32> %14, i32 %18, i32 1
+ // br label %pred.sdiv.continue55
+ //
+ // pred.sdiv.continue55:
+ // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ]
+ // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5
+
+ for (auto KV : PredicatedInstructions) {
BasicBlock::iterator I(KV.first);
- auto *BB = SplitBlock(I->getParent(), &*std::next(I), DT, LI);
+ BasicBlock *Head = I->getParent();
+ auto *BB = SplitBlock(Head, &*std::next(I), DT, LI);
auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false,
/*BranchWeights=*/nullptr, DT, LI);
I->moveBefore(T);
- I->getParent()->setName("pred.store.if");
- BB->setName("pred.store.continue");
+ // Try to move any extractelement we may have created for the predicated
+ // instruction into the Then block.
+ for (Use &Op : I->operands()) {
+ auto *OpInst = dyn_cast<ExtractElementInst>(&*Op);
+ if (OpInst && OpInst->hasOneUse()) // TODO: more accurately, hasOneUser()
+ OpInst->moveBefore(&*I);
+ }
+
+ I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if");
+ BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue");
+
+ // If the instruction is non-void, create a PHI node at the reconvergence
+ // point.
+ if (!I->getType()->isVoidTy()) {
+ Value *IncomingTrue = nullptr;
+ Value *IncomingFalse = nullptr;
+
+ if (I->hasOneUse() && isa<InsertElementInst>(*I->user_begin())) {
+ // If the predicated instruction is feeding an insert-element, move it
+ // into the Then block; Phi node will be created for the vector.
+ InsertElementInst *IEI = cast<InsertElementInst>(*I->user_begin());
+ IEI->moveBefore(T);
+ IncomingTrue = IEI; // the new vector with the inserted element.
+ IncomingFalse = IEI->getOperand(0); // the unmodified vector
+ } else {
+ // Phi node will be created for the scalar predicated instruction.
+ IncomingTrue = &*I;
+ IncomingFalse = UndefValue::get(I->getType());
+ }
+
+ BasicBlock *PostDom = I->getParent()->getSingleSuccessor();
+ assert(PostDom && "Then block has multiple successors");
+ PHINode *Phi =
+ PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front());
+ IncomingTrue->replaceAllUsesWith(Phi);
+ Phi->addIncoming(IncomingFalse, Head);
+ Phi->addIncoming(IncomingTrue, I->getParent());
+ }
}
+
DEBUG(DT->verifyDomTree());
}
@@ -4235,6 +4393,24 @@ void InnerLoopVectorizer::widenPHIInstruction(
}
}
+/// A helper function for checking whether an integer division-related
+/// instruction may divide by zero (in which case it must be predicated if
+/// executed conditionally in the scalar code).
+/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
+/// Divisors that are non-zero but not compile-time constants will not be
+/// converted into multiplication, so we will still end up scalarizing
+/// the division, but can do so without predication.
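+/// For example (illustrative only): 'sdiv i32 %x, %y' and 'sdiv i32 %x, 0'
+/// may divide by zero, whereas 'sdiv i32 %x, 7' cannot.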
+static bool mayDivideByZero(Instruction &I) {
+ assert((I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SDiv ||
+ I.getOpcode() == Instruction::URem ||
+ I.getOpcode() == Instruction::SRem) &&
+ "Unexpected instruction");
+ Value *Divisor = I.getOperand(1);
+ auto *CInt = dyn_cast<ConstantInt>(Divisor);
+ return !CInt || CInt->isZero();
+}
+
void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
@@ -4251,17 +4427,23 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
continue;
} // End of PHI.
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ // Scalarize with predication if this instruction may divide by zero and
+ // its block is executed conditionally; otherwise fall through.
+ if (mayDivideByZero(I) && Legal->blockNeedsPredication(I.getParent())) {
+ scalarizeInstruction(&I, true);
+ continue;
+ }
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
@@ -5155,17 +5337,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(
}
if (I.mayThrow())
return false;
-
- // The instructions below can trap.
- switch (I.getOpcode()) {
- default:
- continue;
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- return false;
- }
}
return true;
@@ -6082,17 +6253,24 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// TODO: IF-converted IFs become selects.
return 0;
}
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // We assume that if-converted blocks have a 50% chance of being executed.
+ // Predicated scalarized instructions are avoided due to the control flow
+ // that bypasses turned-off lanes. If we are not predicating, fall through.
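+ // For example (hypothetical unit costs), at VF = 2 with a scalar sdiv cost
+ // of 20 from TTI, the estimate below is 2 * 20 / 2 = 20, plus the (also
+ // halved) insert/extract/PHI scalarization overhead.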
+ if (VF > 1 && mayDivideByZero(*I) &&
+ Legal->blockNeedsPredication(I->getParent()))
+ return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) / 2 +
+ getScalarizationOverhead(I, VF, true, TTI);
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
@@ -6328,28 +6506,11 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI));
return CallCost;
}
- default: {
- // We are scalarizing the instruction. Return the cost of the scalar
- // instruction, plus the cost of insert and extract into vector
- // elements, times the vector width.
- unsigned Cost = 0;
-
- if (!RetTy->isVoidTy() && VF != 1) {
- unsigned InsCost =
- TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy);
- unsigned ExtCost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy);
-
- // The cost of inserting the results plus extracting each one of the
- // operands.
- Cost += VF * (InsCost + ExtCost * I->getNumOperands());
- }
-
+ default:
// The cost of executing VF copies of the scalar instruction. This opcode
// is unknown. Assume that it is the same as 'mul'.
- Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
- return Cost;
- }
+ return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+ getScalarizationOverhead(I, VF, false, TTI);
} // end of switch.
}
@@ -6407,7 +6568,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
}
void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
- bool IfPredicateStore) {
+ bool IfPredicateInstr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
@@ -6450,7 +6611,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
VectorParts Cond;
- if (IfPredicateStore) {
+ if (IfPredicateInstr) {
assert(Instr->getParent()->getSinglePredecessor() &&
"Only support single predecessor blocks");
Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
@@ -6463,7 +6624,7 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
// Start an "if (pred) a[i] = ..." block.
Value *Cmp = nullptr;
- if (IfPredicateStore) {
+ if (IfPredicateInstr) {
if (Cond[Part]->getType()->isVectorTy())
Cond[Part] =
Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
@@ -6494,16 +6655,16 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
VecResults[Part] = Cloned;
// End if-block.
- if (IfPredicateStore)
- PredicatedStores.push_back(std::make_pair(cast<StoreInst>(Cloned), Cmp));
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
}
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
auto *SI = dyn_cast<StoreInst>(Instr);
- bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+ bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent()));
- return scalarizeInstruction(Instr, IfPredicateStore);
+ return scalarizeInstruction(Instr, IfPredicateInstr);
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
new file mode 100644
index 00000000000..881eb51f9bc
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -0,0 +1,173 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test predication of non-void instructions, specifically (i) that these
+; instructions permit vectorization and (ii) that an insertelement and a PHI
+; node are created. We check the full 2-element sequence for the first
+; instruction; for the rest we just make sure they get predicated based
+; on the code generated for the first element.
+define void @test(i32* nocapture %asd, i32* nocapture %aud,
+ i32* nocapture %asr, i32* nocapture %aur) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %if.end
+ ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: %[[SDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK: %[[SDCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[SDEE]], true
+; CHECK: br i1 %[[SDCC]], label %[[CSD:[a-zA-Z0-9.]+]], label %[[ESD:[a-zA-Z0-9.]+]]
+; CHECK: [[CSD]]:
+; CHECK: %[[SDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[SDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0]], %[[SDA1]]
+; CHECK: %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SD0]], i32 0
+; CHECK: br label %[[ESD]]
+; CHECK: [[ESD]]:
+; CHECK: %[[SDR:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[SD1]], %[[CSD]] ]
+; CHECK: %[[SDEEH:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 1
+; CHECK: %[[SDCCH:[a-zA-Z0-9]+]] = icmp eq i1 %[[SDEEH]], true
+; CHECK: br i1 %[[SDCCH]], label %[[CSDH:[a-zA-Z0-9.]+]], label %[[ESDH:[a-zA-Z0-9.]+]]
+; CHECK: [[CSDH]]:
+; CHECK: %[[SDA0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
+; CHECK: %[[SDA1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
+; CHECK: %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0H]], %[[SDA1H]]
+; CHECK: %[[SD1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> %[[SDR]], i32 %[[SD0H]], i32 1
+; CHECK: br label %[[ESDH]]
+; CHECK: [[ESDH]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ %[[SDR]], %[[ESD]] ], [ %[[SD1H]], %[[CSDH]] ]
+
+; CHECK: %[[UDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK: %[[UDCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[UDEE]], true
+; CHECK: br i1 %[[UDCC]], label %[[CUD:[a-zA-Z0-9.]+]], label %[[EUD:[a-zA-Z0-9.]+]]
+; CHECK: [[CUD]]:
+; CHECK: %[[UDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[UDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA0]], %[[UDA1]]
+; CHECK: %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UD0]], i32 0
+; CHECK: br label %[[EUD]]
+; CHECK: [[EUD]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UD1]], %[[CUD]] ]
+
+; CHECK: %[[SREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK: %[[SRCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[SREE]], true
+; CHECK: br i1 %[[SRCC]], label %[[CSR:[a-zA-Z0-9.]+]], label %[[ESR:[a-zA-Z0-9.]+]]
+; CHECK: [[CSR]]:
+; CHECK: %[[SRA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[SRA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA0]], %[[SRA1]]
+; CHECK: %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SR0]], i32 0
+; CHECK: br label %[[ESR]]
+; CHECK: [[ESR]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[SR1]], %[[CSR]] ]
+
+; CHECK: %[[UREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK: %[[URCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[UREE]], true
+; CHECK: br i1 %[[URCC]], label %[[CUR:[a-zA-Z0-9.]+]], label %[[EUR:[a-zA-Z0-9.]+]]
+; CHECK: [[CUR]]:
+; CHECK: %[[URA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[URA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA0]], %[[URA1]]
+; CHECK: %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UR0]], i32 0
+; CHECK: br label %[[EUR]]
+; CHECK: [[EUR]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UR1]], %[[CUR]] ]
+
+for.body: ; preds = %if.end, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+ %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+ %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
+ %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
+ %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
+ %lsd = load i32, i32* %isd, align 4
+ %lud = load i32, i32* %iud, align 4
+ %lsr = load i32, i32* %isr, align 4
+ %lur = load i32, i32* %iur, align 4
+ %psd = add nsw i32 %lsd, 23
+ %pud = add nsw i32 %lud, 24
+ %psr = add nsw i32 %lsr, 25
+ %pur = add nsw i32 %lur, 26
+ %cmp1 = icmp slt i32 %lsd, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %rsd = sdiv i32 %psd, %lsd
+ %rud = udiv i32 %pud, %lud
+ %rsr = srem i32 %psr, %lsr
+ %rur = urem i32 %pur, %lur
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+ %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
+ %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
+ %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
+ store i32 %ysd.0, i32* %isd, align 4
+ store i32 %yud.0, i32* %iud, align 4
+ store i32 %ysr.0, i32* %isr, align 4
+ store i32 %yur.0, i32* %iur, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 128
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Future-use test for predication under smarter scalar-scalar: this test will
+; fail when the vectorizer starts feeding scalarized values directly to their
+; scalar users, i.e. without generating redundant insertelement/extractelement
+; instructions. This case is already supported by the predication code (which
+; should generate a PHI for the scalar predicated value rather than for the
+; insertelement), but cannot be tested yet.
+; If this test starts failing, kindly fix it by switching to the alternative
+; FFU check sequence below. This will make the test check how we handle this
+; case from now on.
+define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %if.end
+ ret void
+
+; CHECK-LABEL: test_scalar2scalar
+; CHECK: vector.body:
+; CHECK: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
+; CHECK: [[THEN]]:
+; CHECK: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; CHECK: %[[PDV:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[PD]], i32 0
+; CHECK: br label %[[FI]]
+; CHECK: [[FI]]:
+; CHECK: %[[PH:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[PDV]], %[[THEN]] ]
+; FFU-LABEL: test_scalar2scalar
+; FFU: vector.body:
+; FFU: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
+; FFU: [[THEN]]:
+; FFU: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; FFU: br label %[[FI]]
+; FFU: [[FI]]:
+; FFU: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ]
+
+for.body: ; preds = %if.end, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+ %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+ %lsd = load i32, i32* %isd, align 4
+ %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
+ %lsd.b = load i32, i32* %isd.b, align 4
+ %psd = add nsw i32 %lsd, 23
+ %cmp1 = icmp slt i32 %lsd, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %sd1 = sdiv i32 %psd, %lsd
+ %rsd = sdiv i32 %lsd.b, %sd1
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+ store i32 %ysd.0, i32* %isd, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 128
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll b/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
new file mode 100644
index 00000000000..a7d4eba7332
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
@@ -0,0 +1,90 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test that no predication is applied to instructions that are provably safe,
+; e.g. those dividing by a non-zero constant.
+define void @test(i32* nocapture %asd, i32* nocapture %aud,
+ i32* nocapture %asr, i32* nocapture %aur,
+ i32* nocapture %asd0, i32* nocapture %aud0,
+ i32* nocapture %asr0, i32* nocapture %aur0
+) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %if.end
+ ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 11, i32 11>
+; CHECK: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 13, i32 13>
+; CHECK: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 17, i32 17>
+; CHECK: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 19, i32 19>
+; CHECK-NOT: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 0, i32 0>
+
+for.body: ; preds = %if.end, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+ %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+ %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
+ %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
+ %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
+ %lsd = load i32, i32* %isd, align 4
+ %lud = load i32, i32* %iud, align 4
+ %lsr = load i32, i32* %isr, align 4
+ %lur = load i32, i32* %iur, align 4
+ %psd = add nsw i32 %lsd, 23
+ %pud = add nsw i32 %lud, 24
+ %psr = add nsw i32 %lsr, 25
+ %pur = add nsw i32 %lur, 26
+ %isd0 = getelementptr inbounds i32, i32* %asd0, i64 %indvars.iv
+ %iud0 = getelementptr inbounds i32, i32* %aud0, i64 %indvars.iv
+ %isr0 = getelementptr inbounds i32, i32* %asr0, i64 %indvars.iv
+ %iur0 = getelementptr inbounds i32, i32* %aur0, i64 %indvars.iv
+ %lsd0 = load i32, i32* %isd0, align 4
+ %lud0 = load i32, i32* %iud0, align 4
+ %lsr0 = load i32, i32* %isr0, align 4
+ %lur0 = load i32, i32* %iur0, align 4
+ %psd0 = add nsw i32 %lsd, 27
+ %pud0 = add nsw i32 %lud, 28
+ %psr0 = add nsw i32 %lsr, 29
+ %pur0 = add nsw i32 %lur, 30
+ %cmp1 = icmp slt i32 %lsd, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %rsd = sdiv i32 %psd, 11
+ %rud = udiv i32 %pud, 13
+ %rsr = srem i32 %psr, 17
+ %rur = urem i32 %pur, 19
+ %rsd0 = sdiv i32 %psd0, 0
+ %rud0 = udiv i32 %pud0, 0
+ %rsr0 = srem i32 %psr0, 0
+ %rur0 = urem i32 %pur0, 0
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+ %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
+ %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
+ %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
+ %ysd0.0 = phi i32 [ %rsd0, %if.then ], [ %psd0, %for.body ]
+ %yud0.0 = phi i32 [ %rud0, %if.then ], [ %pud0, %for.body ]
+ %ysr0.0 = phi i32 [ %rsr0, %if.then ], [ %psr0, %for.body ]
+ %yur0.0 = phi i32 [ %rur0, %if.then ], [ %pur0, %for.body ]
+ store i32 %ysd.0, i32* %isd, align 4
+ store i32 %yud.0, i32* %iud, align 4
+ store i32 %ysr.0, i32* %isr, align 4
+ store i32 %yur.0, i32* %iur, align 4
+ store i32 %ysd0.0, i32* %isd0, align 4
+ store i32 %yud0.0, i32* %iud0, align 4
+ store i32 %ysr0.0, i32* %isr0, align 4
+ store i32 %yur0.0, i32* %iur0, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 128
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
index f39e774bb89..037f3476658 100644
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -1,7 +1,6 @@
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=VEC
-; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -enable-cond-stores-vec -verify-loop-info -simplifycfg -instcombine < %s | FileCheck %s --check-prefix=VEC-IC
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9.0"
@@ -17,49 +16,27 @@ entry:
; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v10]], i32 0
; VEC: %[[v12:.+]] = icmp eq i1 %[[v11]], true
-; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
-; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]]
;
; VEC: [[cond]]:
+; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0
+; VEC: %[[v14:.+]] = extractelement <2 x i32*> %{{.*}}, i32 0
; VEC: store i32 %[[v13]], i32* %[[v14]], align 4
; VEC: br label %[[else:.+]]
;
; VEC: [[else]]:
; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v10]], i32 1
; VEC: %[[v16:.+]] = icmp eq i1 %[[v15]], true
-; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
-; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]]
;
; VEC: [[cond2]]:
+; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1
+; VEC: %[[v18:.+]] = extractelement <2 x i32*> %{{.+}} i32 1
; VEC: store i32 %[[v17]], i32* %[[v18]], align 4
; VEC: br label %[[else2:.+]]
;
; VEC: [[else2]]:
-; VEC-IC-LABEL: test
-; VEC-IC: %[[v1:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
-; VEC-IC: %[[v2:.+]] = add nsw <2 x i32> %{{.*}}, <i32 20, i32 20>
-; VEC-IC: %[[v3:.+]] = extractelement <2 x i1> %[[v1]], i32 0
-; VEC-IC: br i1 %[[v3]], label %[[cond:.+]], label %[[else:.+]]
-;
-; VEC-IC: [[cond]]:
-; VEC-IC: %[[v4:.+]] = extractelement <2 x i32> %[[v2]], i32 0
-; VEC-IC: store i32 %[[v4]], i32* %{{.*}}, align 4
-; VEC-IC: br label %[[else:.+]]
-;
-; VEC-IC: [[else]]:
-; VEC-IC: %[[v5:.+]] = extractelement <2 x i1> %[[v1]], i32 1
-; VEC-IC: br i1 %[[v5]], label %[[cond2:.+]], label %[[else2:.+]]
-;
-; VEC-IC: [[cond2]]:
-; VEC-IC: %[[v6:.+]] = extractelement <2 x i32> %[[v2]], i32 1
-; VEC-IC: store i32 %[[v6]], i32* %{{.*}}, align 4
-; VEC-IC: br label %[[else2:.+]]
-;
-; VEC-IC: [[else2]]:
-
; UNROLL-LABEL: test
; UNROLL: vector.body:
; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0