1 files changed, 136 insertions, 2 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
index b193a5b4a85..beee3978abb 100644
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
 ; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -66,6 +67,137 @@ loopexit:
   ret void
 }
 
+; Make sure we don't create a vector induction phi node that is unused.
+; Scalarize the step vectors instead.
+;
+; for (int i = 0; i < n; ++i)
+;   sum += a[i];
+;
+; IND-LABEL: @scalarize_induction_variable_01(
+; IND:     vector.body:
+; IND:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND-NOT:   add i64 {{.*}}, 2
+; IND:       getelementptr inbounds i64, i64* %a, i64 %index
+;
+; UNROLL-LABEL: @scalarize_induction_variable_01(
+; UNROLL:     vector.body:
+; UNROLL:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NOT:   add i64 {{.*}}, 4
+; UNROLL:       %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index
+; UNROLL:       getelementptr i64, i64* %[[g1]], i64 2
+
+define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %sum = phi i64 [ %2, %for.body ], [ 0, %entry ]
+  %0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %1 = load i64, i64* %0, align 8
+  %2 = add i64 %1, %sum
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %3  = phi i64 [ %2, %for.body ]
+  ret i64 %3
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors.
+;
+; float s = 0;
+; for (int i ; 0; i < n; i += 8)
+;   s += (a[i] + b[i] + 1.0f);
+;
+; IND-LABEL: @scalarize_induction_variable_02(
+; IND: vector.body:
+; IND:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND:   %[[i0:.+]] = shl i64 %index, 3
+; IND:   %[[i1:.+]] = or i64 %[[i0]], 8
+; IND:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; IND:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+;
+; UNROLL-LABEL: @scalarize_induction_variable_02(
+; UNROLL: vector.body:
+; UNROLL:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL:   %[[i0:.+]] = shl i64 %index, 3
+; UNROLL:   %[[i1:.+]] = or i64 %[[i0]], 8
+; UNROLL:   %[[i2:.+]] = or i64 %[[i0]], 16
+; UNROLL:   %[[i3:.+]] = or i64 %[[i0]], 24
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i0]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i1]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i2]]
+; UNROLL:   getelementptr inbounds float, float* %a, i64 %[[i3]]
+
+define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %s = phi float [ 0.0, %entry ], [ %6, %for.body ]
+  %0 = getelementptr inbounds float, float* %a, i64 %i
+  %1 = load float, float* %0, align 4
+  %2 = getelementptr inbounds float, float* %b, i64 %i
+  %3 = load float, float* %2, align 4
+  %4 = fadd fast float %s, 1.0
+  %5 = fadd fast float %4, %1
+  %6 = fadd fast float %5, %3
+  %i.next = add nuw nsw i64 %i, 8
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %s.lcssa = phi float [ %6, %for.body ]
+  ret float %s.lcssa
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors. (Interleaved accesses.)
+;
+; for (int i = 0; i < n; ++i)
+;   a[i].f ^= y;
+;
+; INTERLEAVE-LABEL: @scalarize_induction_variable_03(
+; INTERLEAVE: vector.body:
+; INTERLEAVE:   %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTERLEAVE:   %[[i1:.+]] = or i64 %[[i0]], 1
+; INTERLEAVE:   %[[i2:.+]] = or i64 %[[i0]], 2
+; INTERLEAVE:   %[[i3:.+]] = or i64 %[[i0]], 3
+; INTERLEAVE:   %[[i4:.+]] = or i64 %[[i0]], 4
+; INTERLEAVE:   %[[i5:.+]] = or i64 %[[i0]], 5
+; INTERLEAVE:   %[[i6:.+]] = or i64 %[[i0]], 6
+; INTERLEAVE:   %[[i7:.+]] = or i64 %[[i0]], 7
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i0]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i1]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i2]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i3]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i4]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i5]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i6]], i32 1
+; INTERLEAVE:   getelementptr inbounds %pair, %pair* %p, i64 %[[i7]], i32 1
+
+%pair = type { i32, i32 }
+define void @scalarize_induction_variable_03(%pair *%p, i32 %y, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i  = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %f = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+  %0 = load i32, i32* %f, align 8
+  %1 = xor i32 %0, %y
+  store i32 %1, i32* %f, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
 
 ; Make sure that the loop exit count computation does not overflow for i8 and
 ; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the
@@ -114,9 +246,11 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; CHECK-LABEL: max_i32_backedgetaken
 ; CHECK:  br i1 true, label %scalar.ph, label %min.iters.checked
 
+; CHECK: middle.block:
+; CHECK:  %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0
 ; CHECK: scalar.ph:
-; CHECK:  %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ]
-; CHECK:  %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ]
+; CHECK:  %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ]
+; CHECK:  %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ 1, %min.iters.checked ], [ %[[v9]], %middle.block ]
 
 define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {