summaryrefslogtreecommitdiffstats
path: root/llvm/test/Transforms/LoopVectorize/induction.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/induction.ll')
-rw-r--r--llvm/test/Transforms/LoopVectorize/induction.ll138
1 files changed, 136 insertions, 2 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index b193a5b4a85..beee3978abb 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1,6 +1,7 @@
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -66,6 +67,137 @@ loopexit:
ret void
}
+; Make sure we don't create a vector induction phi node that is unused.
+; Scalarize the step vectors instead.
+;
+; for (int i = 0; i < n; ++i)
+; sum += a[i];
+;
+; IND-LABEL: @scalarize_induction_variable_01(
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND-NOT: add i64 {{.*}}, 2
+; IND: getelementptr inbounds i64, i64* %a, i64 %index
+;
+; UNROLL-LABEL: @scalarize_induction_variable_01(
+; UNROLL: vector.body:
+; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NOT: add i64 {{.*}}, 4
+; UNROLL: %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index
+; UNROLL: getelementptr i64, i64* %[[g1]], i64 2
+
+define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) { ; Returns the sum of i64 elements of %a; the loop is bottom-tested, so a[0] is always loaded.
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; induction variable, starts at 0
+ %sum = phi i64 [ %2, %for.body ], [ 0, %entry ] ; running sum, starts at 0
+ %0 = getelementptr inbounds i64, i64* %a, i64 %i ; &a[i] -- the only use of %i besides the increment
+ %1 = load i64, i64* %0, align 8
+ %2 = add i64 %1, %sum ; sum += a[i]
+ %i.next = add nuw nsw i64 %i, 1 ; unit step
+ %cond = icmp slt i64 %i.next, %n ; signed compare: keep looping while i + 1 < n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i64 [ %2, %for.body ] ; single-entry phi carrying the final sum out of the loop
+ ret i64 %3
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors.
+;
+; float s = 0;
+; for (int i = 0; i < n; i += 8)
+; s += (a[i] + b[i] + 1.0f);
+;
+; IND-LABEL: @scalarize_induction_variable_02(
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %[[i0:.+]] = shl i64 %index, 3
+; IND: %[[i1:.+]] = or i64 %[[i0]], 8
+; IND: getelementptr inbounds float, float* %a, i64 %[[i0]]
+; IND: getelementptr inbounds float, float* %a, i64 %[[i1]]
+;
+; UNROLL-LABEL: @scalarize_induction_variable_02(
+; UNROLL: vector.body:
+; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %[[i0:.+]] = shl i64 %index, 3
+; UNROLL: %[[i1:.+]] = or i64 %[[i0]], 8
+; UNROLL: %[[i2:.+]] = or i64 %[[i0]], 16
+; UNROLL: %[[i3:.+]] = or i64 %[[i0]], 24
+; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i0]]
+; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i1]]
+; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i2]]
+; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i3]]
+
+define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) { ; Accumulates a[i] + b[i] + 1.0 into a float for i = 0, 8, 16, ...; bottom-tested loop.
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; induction variable, starts at 0
+ %s = phi float [ 0.0, %entry ], [ %6, %for.body ] ; running float accumulator, starts at 0.0
+ %0 = getelementptr inbounds float, float* %a, i64 %i ; &a[i]
+ %1 = load float, float* %0, align 4
+ %2 = getelementptr inbounds float, float* %b, i64 %i ; &b[i]
+ %3 = load float, float* %2, align 4
+ %4 = fadd fast float %s, 1.0 ; s + 1.0
+ %5 = fadd fast float %4, %1 ; ... + a[i]
+ %6 = fadd fast float %5, %3 ; ... + b[i]; this is the next value of %s
+ %i.next = add nuw nsw i64 %i, 8 ; stride of 8 elements per iteration
+ %cond = icmp slt i64 %i.next, %n ; signed compare: keep looping while i + 8 < n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %s.lcssa = phi float [ %6, %for.body ] ; single-entry phi carrying the final accumulator out of the loop
+ ret float %s.lcssa
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors. (Interleaved accesses.)
+;
+; for (int i = 0; i < n; ++i)
+; a[i].f ^= y;
+;
+; INTERLEAVE-LABEL: @scalarize_induction_variable_03(
+; INTERLEAVE: vector.body:
+; INTERLEAVE: %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTERLEAVE: %[[i1:.+]] = or i64 %[[i0]], 1
+; INTERLEAVE: %[[i2:.+]] = or i64 %[[i0]], 2
+; INTERLEAVE: %[[i3:.+]] = or i64 %[[i0]], 3
+; INTERLEAVE: %[[i4:.+]] = or i64 %[[i0]], 4
+; INTERLEAVE: %[[i5:.+]] = or i64 %[[i0]], 5
+; INTERLEAVE: %[[i6:.+]] = or i64 %[[i0]], 6
+; INTERLEAVE: %[[i7:.+]] = or i64 %[[i0]], 7
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i0]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i1]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i2]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i3]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i4]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i5]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i6]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair, %pair* %p, i64 %[[i7]], i32 1
+
+%pair = type { i32, i32 }
+define void @scalarize_induction_variable_03(%pair *%p, i32 %y, i64 %n) { ; XORs %y into field 1 of each %pair element of %p, in place; bottom-tested loop.
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; induction variable, starts at 0
+ %f = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1 ; address of field 1 of p[i]
+ %0 = load i32, i32* %f, align 8
+ %1 = xor i32 %0, %y
+ store i32 %1, i32* %f, align 8 ; write back to the same field that was loaded
+ %i.next = add nuw nsw i64 %i, 1 ; unit step
+ %cond = icmp slt i64 %i.next, %n ; signed compare: keep looping while i + 1 < n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
; Make sure that the loop exit count computation does not overflow for i8 and
; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the
@@ -114,9 +246,11 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
; CHECK-LABEL: max_i32_backedgetaken
; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked
+; CHECK: middle.block:
+; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0
; CHECK: scalar.ph:
-; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ]
-; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ]
+; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ]
+; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ]
define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
OpenPOWER on IntegriCloud