summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp96
-rw-r--r--llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll6
-rw-r--r--llvm/test/Transforms/LoopVectorize/cast-induction.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/gcc-examples.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll11
-rw-r--r--llvm/test/Transforms/LoopVectorize/global_alias.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/induction.ll65
-rw-r--r--llvm/test/Transforms/LoopVectorize/induction_plus.ll9
9 files changed, 161 insertions, 40 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 584cc83c337..144eab58f74 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -422,6 +422,14 @@ protected:
/// from SCEV or creates a new using SCEVExpander.
virtual Value *getStepVector(Value *Val, int StartIdx, const SCEV *Step);
+ /// Create a vector induction variable based on an existing scalar one.
+ /// Currently only works for integer primary induction variables with
+ /// a constant step.
+ /// If TruncType is provided, instead of widening the original IV, we
+ /// widen a version of the IV truncated to TruncType.
+ void widenInductionVariable(const InductionDescriptor &II, VectorParts &Entry,
+ IntegerType *TruncType = nullptr);
+
/// When we go over instructions in the basic block we rely on previous
/// values within the current basic block or on loop invariant values.
/// When we widen (vectorize) values we place them in the map. If the values
@@ -2099,6 +2107,40 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
return getStepVector(Val, StartIdx, StepValue);
}
+void InnerLoopVectorizer::widenInductionVariable(const InductionDescriptor &II,
+ VectorParts &Entry,
+ IntegerType *TruncType) {
+ Value *Start = II.getStartValue();
+ ConstantInt *Step = II.getConstIntStepValue();
+ assert(Step && "Can not widen an IV with a non-constant step");
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ if (TruncType) {
+ Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+ Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+ Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+ Builder.restoreIP(CurrIP);
+
+ Value *SplatVF =
+ ConstantVector::getSplat(VF, ConstantInt::get(Start->getType(), VF));
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+ &*LoopVectorBody->getFirstInsertionPt());
+ Value *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Entry[Part] = LastInduction;
+ LastInduction = Builder.CreateAdd(LastInduction, SplatVF, "step.add");
+ }
+
+ VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
+ VecInd->addIncoming(LastInduction, LoopVectorBody);
+}
+
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
Value *Step) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
@@ -4056,19 +4098,25 @@ void InnerLoopVectorizer::widenPHIInstruction(
llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction: {
assert(P->getType() == II.getStartValue()->getType() && "Types must match");
- // Handle other induction variables that are now based on the
- // canonical one.
- Value *V = Induction;
- if (P != OldInduction) {
- V = Builder.CreateSExtOrTrunc(Induction, P->getType());
- V = II.transform(Builder, V, PSE.getSE(), DL);
- V->setName("offset.idx");
+ if (P != OldInduction || VF == 1) {
+ Value *V = Induction;
+ // Handle other induction variables that are now based on the
+ // canonical one.
+ if (P != OldInduction) {
+ V = Builder.CreateSExtOrTrunc(Induction, P->getType());
+ V = II.transform(Builder, V, PSE.getSE(), DL);
+ V->setName("offset.idx");
+ }
+ Value *Broadcasted = getBroadcastInstrs(V);
+ // After broadcasting the induction variable we need to make the vector
+ // consecutive by adding 0, 1, 2, etc.
+ for (unsigned part = 0; part < UF; ++part)
+ Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep());
+ } else {
+ // Instead of re-creating the vector IV by splatting the scalar IV
+ // in each iteration, we can make a new independent vector IV.
+ widenInductionVariable(II, Entry);
}
- Value *Broadcasted = getBroadcastInstrs(V);
- // After broadcasting the induction variable we need to make the vector
- // consecutive by adding 0, 1, 2, etc.
- for (unsigned part = 0; part < UF; ++part)
- Entry[part] = getStepVector(Broadcasted, VF * part, II.getStep());
return;
}
case InductionDescriptor::IK_PtrInduction:
@@ -4239,15 +4287,23 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
if (CI->getOperand(0) == OldInduction &&
it->getOpcode() == Instruction::Trunc) {
InductionDescriptor II =
- Legal->getInductionVars()->lookup(OldInduction);
+ Legal->getInductionVars()->lookup(OldInduction);
if (auto StepValue = II.getConstIntStepValue()) {
- StepValue = ConstantInt::getSigned(cast<IntegerType>(CI->getType()),
- StepValue->getSExtValue());
- Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
- CI->getType());
- Value *Broadcasted = getBroadcastInstrs(ScalarCast);
- for (unsigned Part = 0; Part < UF; ++Part)
- Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue);
+ IntegerType *TruncType = cast<IntegerType>(CI->getType());
+ if (VF == 1) {
+ StepValue =
+ ConstantInt::getSigned(TruncType, StepValue->getSExtValue());
+ Value *ScalarCast =
+ Builder.CreateCast(CI->getOpcode(), Induction, CI->getType());
+ Value *Broadcasted = getBroadcastInstrs(ScalarCast);
+ for (unsigned Part = 0; Part < UF; ++Part)
+ Entry[Part] = getStepVector(Broadcasted, VF * Part, StepValue);
+ } else {
+ // Truncating a vector induction variable on each iteration
+ // may be expensive. Instead, truncate the initial value, and create
+ // a new, truncated, vector IV based on that.
+ widenInductionVariable(II, Entry, TruncType);
+ }
addMetadata(Entry, &*it);
break;
}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
index 65b3919585e..12a16018f58 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
@@ -43,7 +43,7 @@ for.end12: ; preds = %for.end, %entry
; CHECK-LABEL: @s173
; CHECK: load <4 x float>, <4 x float>*
-; CHECK: add i64 %index, 16000
+; CHECK: add nsw i64 %.lhs, 16000
; CHECK: ret i32 0
}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 2f1de54d5f9..23e363eae02 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -95,7 +95,7 @@ for.end: ; preds = %for.cond
%struct.In = type { float, float }
;AVX512-LABEL: @foo2
-;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void
@@ -170,10 +170,10 @@ for.end: ; preds = %for.cond
;}
;AVX512-LABEL: @foo3
-;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: fadd <16 x float>
-;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
+;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %{{.*}}, i32 1
;AVX512: llvm.masked.scatter.v16f32
;AVX512: ret void
diff --git a/llvm/test/Transforms/LoopVectorize/cast-induction.ll b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
index fae89976a7b..54f68b7bd07 100644
--- a/llvm/test/Transforms/LoopVectorize/cast-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx10.8.0"
@a = common global [2048 x i32] zeroinitializer, align 16
;CHECK-LABEL: @example12(
-;CHECK: trunc i64
+;CHECK: %vec.ind1 = phi <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @example12() nounwind uwtable ssp {
diff --git a/llvm/test/Transforms/LoopVectorize/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
index 18809018615..95b0d16d57f 100644
--- a/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -368,7 +368,7 @@ define void @example11() nounwind uwtable ssp {
}
;CHECK-LABEL: @example12(
-;CHECK: trunc i64
+;CHECK: %vec.ind1 = phi <4 x i32>
;CHECK: store <4 x i32>
;CHECK: ret void
define void @example12() nounwind uwtable ssp {
diff --git a/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
index ab2fd5e4e1c..fb12e172f54 100644
--- a/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
+++ b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
@@ -12,10 +12,11 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; CHECK-LABEL: @foo
; CHECK: vector.body
-; CHECK: %0 = getelementptr inbounds double*, double** %in, i64 %index
-; CHECK: %1 = bitcast double** %0 to <4 x i64>*
-; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %1, align 8
-; CHECK: %2 = icmp eq <4 x i64> %wide.load, zeroinitializer
+; CHECK: %0 = phi
+; CHECK: %2 = getelementptr inbounds double*, double** %in, i64 %0
+; CHECK: %3 = bitcast double** %2 to <4 x i64>*
+; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %3, align 8
+; CHECK: %4 = icmp eq <4 x i64> %wide.load, zeroinitializer
; CHECK: br i1
define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 {
@@ -37,4 +38,4 @@ for.body:
for.end:
ret void
-} \ No newline at end of file
+}
diff --git a/llvm/test/Transforms/LoopVectorize/global_alias.ll b/llvm/test/Transforms/LoopVectorize/global_alias.ll
index 84fa48cd514..0da841bcbbd 100644
--- a/llvm/test/Transforms/LoopVectorize/global_alias.ll
+++ b/llvm/test/Transforms/LoopVectorize/global_alias.ll
@@ -12,7 +12,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
@PA = external global i32*
-;; === First, the tests that should always vectorize, wither statically or by adding run-time checks ===
+;; === First, the tests that should always vectorize, whether statically or by adding run-time checks ===
; /// Different objects, positive induction, constant distance
@@ -387,7 +387,7 @@ for.end: ; preds = %for.cond
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias08(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias08(i32 %a) #0 {
@@ -439,7 +439,7 @@ for.end: ; preds = %for.cond
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias09(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias09(i32 %a) #0 {
@@ -721,7 +721,7 @@ for.end: ; preds = %for.cond
; return Foo.A[a];
; }
; CHECK-LABEL: define i32 @noAlias14(
-; CHECK: sub <4 x i32>
+; CHECK: sub nuw nsw <4 x i32>
; CHECK: ret
define i32 @noAlias14(i32 %a) #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index 8e3cf365e83..c2d4d96153a 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1,4 +1,6 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -27,8 +29,6 @@ for.end:
ret void
}
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
-
; Make sure we remove unneeded vectorization of induction variables.
; In order for instcombine to cleanup the vectorized induction variables that we
; create in the loop vectorizer we need to perform some form of redundancy
@@ -241,3 +241,64 @@ entry:
exit:
ret void
}
+
+; Check that we generate vectorized IVs in the pre-header
+; instead of widening the scalar IV inside the loop, when
+; we know how to do that.
+; IND-LABEL: veciv
+; IND: vector.body:
+; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add, %vector.body ]
+; IND: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND: %index.next = add i32 %index, 2
+; IND: %[[CMP:.*]] = icmp eq i32 %index.next
+; IND: br i1 %[[CMP]]
+; UNROLL-LABEL: veciv
+; UNROLL: vector.body:
+; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %step.add1, %vector.body ]
+; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL: %step.add1 = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL: %index.next = add i32 %index, 4
+; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next
+; UNROLL: br i1 %[[CMP]]
+define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
+ store i32 %indvars.iv, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %k
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; IND-LABEL: trunciv
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %[[VECIND:.*]] = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %[[STEPADD:.*]], %vector.body ]
+; IND: %[[STEPADD]] = add <2 x i32> %[[VECIND]], <i32 2, i32 2>
+; IND: %index.next = add i64 %index, 2
+; IND: %[[CMP:.*]] = icmp eq i64 %index.next
+; IND: br i1 %[[CMP]]
+define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %trunc.iv = trunc i64 %indvars.iv to i32
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
+ store i32 %trunc.iv, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %k
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/induction_plus.ll b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
index 7c4c8f2edcb..5e96d4196ca 100644
--- a/llvm/test/Transforms/LoopVectorize/induction_plus.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@@ -6,8 +6,11 @@ target triple = "x86_64-apple-macosx10.8.0"
@array = common global [1024 x i32] zeroinitializer, align 16
;CHECK-LABEL: @array_at_plus_one(
-;CHECK: add i64 %index, 12
-;CHECK: trunc i64
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %step.add, %vector.body ]
+;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %step.add2, %vector.body ]
+;CHECK: add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+;CHECK: add nsw <4 x i64> %vec.ind, <i64 12, i64 12, i64 12, i64 12>
;CHECK: ret i32
define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 0
OpenPOWER on IntegriCloud