3 files changed, 69 insertions, 16 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 00a30b704cd..62b1339138f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2377,6 +2377,16 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
       return VectorLoopValueMap.initVector(V, Entry);
     }
 
+    // Get the last scalarized instruction. This corresponds to the instruction
+    // we created for the last vector lane on the last unroll iteration.
+    auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, VF - 1));
+
+    // Set the insert point after the last scalarized instruction. This ensures
+    // the insertelement sequence will directly follow the scalar definitions.
+    auto OldIP = Builder.saveIP();
+    auto NewIP = std::next(BasicBlock::iterator(LastInst));
+    Builder.SetInsertPoint(&*NewIP);
+
     // However, if we are vectorizing, we need to construct the vector values
     // using insertelement instructions. Since the resulting vectors are stored
     // in VectorLoopValueMap, we will only generate the insertelements once.
@@ -2387,6 +2397,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
             Insert, getScalarValue(V, Part, Width), Builder.getInt32(Width));
       Entry[Part] = Insert;
     }
+    Builder.restoreIP(OldIP);
     return VectorLoopValueMap.initVector(V, Entry);
   }
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
index 5129568075f..fc68adb59df 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 
@@ -297,3 +298,44 @@ for.cond.cleanup3:
   %exitcond = icmp eq i32 %inc, 49
   br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader
 }
+
+; UNROLL-NO-IC-LABEL: @PR29559
+; PR29559: the scalarized GEP feeding the recurrence must be packed by insertelements placed right after it, ahead of the recurrence shuffles.
+; UNROLL-NO-IC: vector.ph:
+; UNROLL-NO-IC:   br label %vector.body
+;
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NO-IC:   %vector.recur = phi <4 x float*> [ undef, %vector.ph ], [ %[[I4:.+]], %vector.body ]
+; UNROLL-NO-IC:   %[[G1:.+]] = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
+; UNROLL-NO-IC:   %[[I1:.+]] = insertelement <4 x float*> undef, float* %[[G1]], i32 0
+; UNROLL-NO-IC:   %[[I2:.+]] = insertelement <4 x float*> %[[I1]], float* %[[G1]], i32 1
+; UNROLL-NO-IC:   %[[I3:.+]] = insertelement <4 x float*> %[[I2]], float* %[[G1]], i32 2
+; UNROLL-NO-IC:   %[[I4]] = insertelement <4 x float*> %[[I3]], float* %[[G1]], i32 3
+; UNROLL-NO-IC:   {{.*}} = shufflevector <4 x float*> %vector.recur, <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC:   {{.*}} = shufflevector <4 x float*> %[[I4]], <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+;
+; UNROLL-NO-IC: middle.block:
+; UNROLL-NO-IC:   %vector.recur.extract = extractelement <4 x float*> %[[I4]], i32 3
+;
+; UNROLL-NO-IC: scalar.ph:
+; UNROLL-NO-IC:   %scalar.recur.init = phi float* [ %vector.recur.extract, %middle.block ], [ undef, %min.iters.checked ], [ undef, %entry ]
+;
+; UNROLL-NO-IC: scalar.body:
+; UNROLL-NO-IC:   %scalar.recur = phi float* [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+define void @PR29559() {
+entry:
+  br label %scalar.body
+
+scalar.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+  %tmp2 = phi float* [ undef, %entry ], [ %tmp3, %scalar.body ]
+  %tmp3 = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, undef
+  br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 5f165c52efc..ec67e632efb 100755
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -54,6 +54,22 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]]
 ; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]]
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6
+; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8
+; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9
+; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10
+; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11
+; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12
+; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13
+; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14
+; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15
 ; CHECK-NEXT:    [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0
 ; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0
@@ -87,22 +103,6 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0
 ; CHECK-NEXT:    [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15
 ; CHECK-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11
-; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14
-; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15
 ; CHECK-NEXT:    [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK:         [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>