Diffstat (limited to 'llvm/test/Transforms/LoopVectorize')
-rw-r--r--  llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll | 39
-rw-r--r--  llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll | 71
-rw-r--r--  llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll | 53
-rw-r--r--  llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll | 19
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll | 79
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll | 147
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll | 166
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll | 54
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll | 85
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll | 37
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll | 189
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg | 5
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll | 310
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll | 56
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll | 49
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll | 144
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll | 37
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll | 56
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll | 153
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll | 231
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll | 171
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll | 31
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll | 33
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll | 47
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll | 29
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll | 28
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll | 330
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll | 71
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll | 88
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll | 60
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll | 147
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll | 114
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll | 165
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll | 37
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll | 52
-rw-r--r--  llvm/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll | 173
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll | 40
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll | 75
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll | 140
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll | 49
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll | 62
-rw-r--r--  llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll | 51
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll | 72
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll | 38
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll | 28
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll | 33
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll | 149
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll | 70
-rw-r--r--  llvm/test/Transforms/LoopVectorize/SystemZ/pr38110.ll | 50
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/assume.ll | 100
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/avx1.ll | 52
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/avx512.ll | 112
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll | 108
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll | 67
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll | 53
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll | 47
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/cost-model.ll | 82
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll | 149
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll | 39
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll | 40
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll | 25
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/funclet.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll | 86
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll | 41
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll | 1754
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll | 77
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll | 240
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll | 177
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll | 76
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll | 40
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/interleaving.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll | 93
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll | 237
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/lit.local.cfg | 3
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll | 3374
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/max-mstore.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll | 2473
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll | 145
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/no-vector.ll | 22
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll | 109
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll | 113
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/optsize.ll | 198
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll | 114
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll | 50
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll | 115
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/powof2div.ll | 32
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr23997.ll | 109
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr34438.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr35432.ll | 213
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr36524.ll | 39
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/pr39160.ll | 98
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/propagate-metadata.ll | 25
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll | 33
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll | 35
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll | 112
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll | 80
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll | 134
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll | 135
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll | 32
-rwxr-xr-x  llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll | 114
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll | 49
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/small-size.ll | 408
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll | 54
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/struct-store.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll | 187
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll | 501
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/tripcount.ll | 39
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll | 99
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll | 47
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll | 23
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll | 31
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll | 102
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/unroll_selection.ll | 71
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll | 632
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.ll | 87
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll | 217
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll | 66
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll | 75
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll | 150
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll | 74
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll | 313
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll | 112
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll | 73
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll | 39
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll | 826
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll | 60
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll | 98
-rw-r--r--  llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll | 29
-rw-r--r--  llvm/test/Transforms/LoopVectorize/XCore/lit.local.cfg | 2
-rw-r--r--  llvm/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll | 23
-rw-r--r--  llvm/test/Transforms/LoopVectorize/align.ll | 32
-rw-r--r--  llvm/test/Transforms/LoopVectorize/bsd_regex.ll | 38
-rw-r--r--  llvm/test/Transforms/LoopVectorize/bzip_reverse_loops.ll | 65
-rw-r--r--  llvm/test/Transforms/LoopVectorize/calloc.ll | 49
-rw-r--r--  llvm/test/Transforms/LoopVectorize/cast-induction.ll | 29
-rw-r--r--  llvm/test/Transforms/LoopVectorize/conditional-assignment.ll | 57
-rw-r--r--  llvm/test/Transforms/LoopVectorize/consec_no_gep.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll | 490
-rw-r--r--  llvm/test/Transforms/LoopVectorize/control-flow.ll | 77
-rw-r--r--  llvm/test/Transforms/LoopVectorize/cpp-new-array.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/dbg.value.ll | 77
-rw-r--r--  llvm/test/Transforms/LoopVectorize/dead_instructions.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/debugloc.ll | 89
-rw-r--r--  llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll | 20
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll | 77
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll | 200
-rw-r--r--  llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll | 213
-rw-r--r--  llvm/test/Transforms/LoopVectorize/disable_nonforced.ll | 29
-rw-r--r--  llvm/test/Transforms/LoopVectorize/disable_nonforced_enable.ll | 29
-rw-r--r--  llvm/test/Transforms/LoopVectorize/discriminator.ll | 76
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ee-crash.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/exact.ll | 23
-rw-r--r--  llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll | 236
-rw-r--r--  llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll | 177
-rw-r--r--  llvm/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll | 138
-rw-r--r--  llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll | 25
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll | 574
-rw-r--r--  llvm/test/Transforms/LoopVectorize/flags.ll | 78
-rw-r--r--  llvm/test/Transforms/LoopVectorize/float-induction.ll | 340
-rw-r--r--  llvm/test/Transforms/LoopVectorize/float-reduction.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/followup.ll | 43
-rw-r--r--  llvm/test/Transforms/LoopVectorize/funcall.ll | 32
-rw-r--r--  llvm/test/Transforms/LoopVectorize/gcc-examples.ll | 685
-rw-r--r--  llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll | 41
-rw-r--r--  llvm/test/Transforms/LoopVectorize/global_alias.ll | 1077
-rw-r--r--  llvm/test/Transforms/LoopVectorize/hints-trans.ll | 29
-rw-r--r--  llvm/test/Transforms/LoopVectorize/hoist-loads.ll | 70
-rw-r--r--  llvm/test/Transforms/LoopVectorize/i8-induction.ll | 40
-rw-r--r--  llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll | 35
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-conv-crash.ll | 60
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll | 245
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll | 118
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-conversion-reduction.ll | 37
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-conversion.ll | 197
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll | 277
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll | 89
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-pred-stores.ll | 178
-rw-r--r--  llvm/test/Transforms/LoopVectorize/if-reduction.ll | 821
-rw-r--r--  llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll | 142
-rw-r--r--  llvm/test/Transforms/LoopVectorize/increment.ll | 65
-rw-r--r--  llvm/test/Transforms/LoopVectorize/induction-step.ll | 201
-rw-r--r--  llvm/test/Transforms/LoopVectorize/induction.ll | 896
-rw-r--r--  llvm/test/Transforms/LoopVectorize/induction_plus.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/infiniteloop.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/int_sideeffect.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses-1.ll | 78
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses-2.ll | 58
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll | 57
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll | 63
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll | 222
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll | 165
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll | 921
-rw-r--r--  llvm/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll | 43
-rw-r--r--  llvm/test/Transforms/LoopVectorize/intrinsic.ll | 1357
-rw-r--r--  llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll | 593
-rw-r--r--  llvm/test/Transforms/LoopVectorize/iv_outside_user.ll | 176
-rw-r--r--  llvm/test/Transforms/LoopVectorize/lcssa-crash.ll | 62
-rw-r--r--  llvm/test/Transforms/LoopVectorize/legal_preheader_check.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/libcall-remark.ll | 52
-rw-r--r--  llvm/test/Transforms/LoopVectorize/lifetime.ll | 96
-rw-r--r--  llvm/test/Transforms/LoopVectorize/loop-form.ll | 31
-rw-r--r--  llvm/test/Transforms/LoopVectorize/loop-scalars.ll | 143
-rw-r--r--  llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll | 26
-rw-r--r--  llvm/test/Transforms/LoopVectorize/memdep.ll | 273
-rw-r--r--  llvm/test/Transforms/LoopVectorize/metadata-unroll.ll | 40
-rw-r--r--  llvm/test/Transforms/LoopVectorize/metadata-width.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/metadata.ll | 43
-rw-r--r--  llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll | 110
-rw-r--r--  llvm/test/Transforms/LoopVectorize/miniters.ll | 44
-rw-r--r--  llvm/test/Transforms/LoopVectorize/minmax_reduction.ll | 885
-rw-r--r--  llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll | 41
-rw-r--r--  llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll | 43
-rw-r--r--  llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll | 64
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll | 35
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no_array_bounds.ll | 100
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no_int_induction.ll | 60
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no_outside_user.ll | 414
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no_switch.ll | 93
-rw-r--r--  llvm/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll | 95
-rw-r--r--  llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll | 59
-rw-r--r--  llvm/test/Transforms/LoopVectorize/noalias-md.ll | 78
-rw-r--r--  llvm/test/Transforms/LoopVectorize/nofloat.ll | 28
-rw-r--r--  llvm/test/Transforms/LoopVectorize/non-const-n.ll | 37
-rw-r--r--  llvm/test/Transforms/LoopVectorize/nontemporal.ll | 46
-rw-r--r--  llvm/test/Transforms/LoopVectorize/nsw-crash.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/opt.ll | 27
-rw-r--r--  llvm/test/Transforms/LoopVectorize/optsize.ll | 102
-rw-r--r--  llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll | 82
-rw-r--r--  llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll | 112
-rw-r--r--  llvm/test/Transforms/LoopVectorize/partial-lcssa.ll | 54
-rw-r--r--  llvm/test/Transforms/LoopVectorize/phi-cost.ll | 86
-rw-r--r--  llvm/test/Transforms/LoopVectorize/phi-hang.ll | 47
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr25281.ll | 58
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr28541.ll | 71
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll | 241
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll | 66
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr30806.ll | 65
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr31098.ll | 100
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr31190.ll | 63
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr32859.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr33706.ll | 61
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr34681.ll | 122
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr35743.ll | 102
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr35773.ll | 53
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr36311.ll | 49
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr36983.ll | 24
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr37248.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr37515.ll | 20
-rwxr-xr-x  llvm/test/Transforms/LoopVectorize/pr38800.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr39099.ll | 42
-rw-r--r--  llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll | 54
-rw-r--r--  llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll | 38
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ptr-induction.ll | 34
-rw-r--r--  llvm/test/Transforms/LoopVectorize/ptr_loops.ll | 73
-rw-r--r--  llvm/test/Transforms/LoopVectorize/read-only.ll | 31
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction-small-size.ll | 73
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reduction.ll | 580
-rw-r--r--  llvm/test/Transforms/LoopVectorize/remove_metadata.ll | 32
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reverse_induction.ll | 152
-rw-r--r--  llvm/test/Transforms/LoopVectorize/reverse_iter.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll | 221
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll | 132
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll | 37
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-check.ll | 179
-rw-r--r--  llvm/test/Transforms/LoopVectorize/runtime-limit.ll | 101
-rw-r--r--  llvm/test/Transforms/LoopVectorize/safegep.ll | 61
-rw-r--r--  llvm/test/Transforms/LoopVectorize/same-base-access.ll | 107
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scalar-select.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll | 74
-rw-r--r--  llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll | 113
-rw-r--r--  llvm/test/Transforms/LoopVectorize/simple-unroll.ll | 38
-rw-r--r--  llvm/test/Transforms/LoopVectorize/skip-iterations.ll | 181
-rw-r--r--  llvm/test/Transforms/LoopVectorize/small-loop.ll | 57
-rw-r--r--  llvm/test/Transforms/LoopVectorize/start-non-zero.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll | 49
-rw-r--r--  llvm/test/Transforms/LoopVectorize/struct_access.ll | 87
-rw-r--r--  llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll | 101
-rw-r--r--  llvm/test/Transforms/LoopVectorize/tripcount.ll | 211
-rw-r--r--  llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/unroll.ll | 37
-rw-r--r--  llvm/test/Transforms/LoopVectorize/unroll_novec.ll | 48
-rw-r--r--  llvm/test/Transforms/LoopVectorize/unsafe-dep-remark.ll | 73
-rw-r--r--  llvm/test/Transforms/LoopVectorize/unsized-pointee-crash.ll | 23
-rw-r--r--  llvm/test/Transforms/LoopVectorize/value-ptr-bug.ll | 50
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll | 211
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll | 36
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vect.stats.ll | 58
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vector-geps.ll | 61
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vectorize-once.ll | 76
-rw-r--r--  llvm/test/Transforms/LoopVectorize/version-mem-access.ll | 94
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll | 45
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll | 51
-rw-r--r--  llvm/test/Transforms/LoopVectorize/write-only.ll | 25
-rw-r--r--  llvm/test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll | 26
314 files changed, 42557 insertions(+), 0 deletions(-)
diff --git a/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
new file mode 100644
index 00000000000..4766794ab45
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
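+; If-conversion flattens the conditional update in the loop below into a
+; select. In C, the loop is roughly (a sketch reconstructed from the IR, not
+; from any original source):
+;
+;   for (long i = 0; i < x; ++i)
+;     A[i] = A[i] ? (int)i * t + (int)(i + 45) : 9;
+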
+;CHECK-LABEL: @foo(
+;CHECK: icmp eq <4 x i32>
+;CHECK: select <4 x i1>
+;CHECK: ret i32
+define i32 @foo(i32 %x, i32 %t, i32* nocapture %A) nounwind uwtable ssp {
+entry:
+ %cmp10 = icmp sgt i32 %x, 0
+ br i1 %cmp10, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %if.end
+ %indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %tobool = icmp eq i32 %0, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %for.body
+ %1 = add nsw i64 %indvars.iv, 45
+ %2 = trunc i64 %indvars.iv to i32
+ %mul = mul nsw i32 %2, %t
+ %3 = trunc i64 %1 to i32
+ %add1 = add nsw i32 %3, %mul
+ br label %if.end
+
+if.end: ; preds = %for.body, %if.then
+ %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ]
+ store i32 %z.0, i32* %arrayidx, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %x
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %if.end, %entry
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
new file mode 100644
index 00000000000..b3eae690423
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
@@ -0,0 +1,71 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce
+
+; Check that we don't fall into an infinite loop.
+define void @test() nounwind {
+entry:
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ br label %for.body
+}
+
+
+
+define void @test2() nounwind {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ %indvars.iv.next48 = add i64 %indvars.iv47, 1
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ unreachable
+}
+
+;PR14701
+define void @start_model_rare() nounwind uwtable ssp {
+entry:
+ br i1 undef, label %return, label %if.end
+
+if.end: ; preds = %entry
+ br i1 undef, label %cond.false, label %cond.true
+
+cond.true: ; preds = %if.end
+ unreachable
+
+cond.false: ; preds = %if.end
+ br i1 undef, label %cond.false28, label %cond.true20
+
+cond.true20: ; preds = %cond.false
+ unreachable
+
+cond.false28: ; preds = %cond.false
+ br label %for.body40
+
+for.body40: ; preds = %for.inc50, %cond.false28
+ %indvars.iv123 = phi i64 [ 3, %cond.false28 ], [ %indvars.iv.next124, %for.inc50 ]
+ %step.0121 = phi i32 [ 1, %cond.false28 ], [ %step.1, %for.inc50 ]
+ br i1 undef, label %if.then46, label %for.inc50
+
+if.then46: ; preds = %for.body40
+ %inc47 = add nsw i32 %step.0121, 1
+ br label %for.inc50
+
+for.inc50: ; preds = %if.then46, %for.body40
+ %k.1 = phi i32 [ undef, %for.body40 ], [ %inc47, %if.then46 ]
+ %step.1 = phi i32 [ %step.0121, %for.body40 ], [ %inc47, %if.then46 ]
+ %indvars.iv.next124 = add i64 %indvars.iv123, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next124 to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %for.end52, label %for.body40
+
+for.end52: ; preds = %for.inc50
+ unreachable
+
+return: ; preds = %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll b/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
new file mode 100644
index 00000000000..baf96b84a34
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4
+
+; Check that we don't crash.
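+;
+; @set1ds below is roughly the following C (a sketch reconstructed from the
+; IR; only the absence of a crash is being tested):
+;
+;   for (long i = 0; i < n; i += stride)
+;     arr[i] = value;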
+
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+module asm "\09.ident\09\22GCC: (GNU) 4.6.3 LLVM: 3.2svn\22"
+
+@b = common global [32000 x float] zeroinitializer, align 16
+
+define i32 @set1ds(i32 %_n, float* nocapture %arr, float %value, i32 %stride) nounwind uwtable {
+entry:
+ %0 = icmp sgt i32 %_n, 0
+ br i1 %0, label %"3.lr.ph", label %"5"
+
+"3.lr.ph": ; preds = %entry
+ %1 = bitcast float* %arr to i8*
+ %2 = sext i32 %stride to i64
+ br label %"3"
+
+"3": ; preds = %"3.lr.ph", %"3"
+ %indvars.iv = phi i64 [ 0, %"3.lr.ph" ], [ %indvars.iv.next, %"3" ]
+ %3 = shl nsw i64 %indvars.iv, 2
+ %4 = getelementptr inbounds i8, i8* %1, i64 %3
+ %5 = bitcast i8* %4 to float*
+ store float %value, float* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, %2
+ %6 = trunc i64 %indvars.iv.next to i32
+ %7 = icmp slt i32 %6, %_n
+ br i1 %7, label %"3", label %"5"
+
+"5": ; preds = %"3", %entry
+ ret i32 0
+}
+
+define i32 @init(i8* nocapture %name) unnamed_addr nounwind uwtable {
+entry:
+ br label %"3"
+
+"3": ; preds = %"3", %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %"3" ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %1 = getelementptr inbounds i8, i8* bitcast (float* getelementptr inbounds ([32000 x float], [32000 x float]* @b, i64 0, i64 16000) to i8*), i64 %0
+ %2 = bitcast i8* %1 to float*
+ store float -1.000000e+00, float* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 16000
+ br i1 %exitcond, label %"5", label %"3"
+
+"5": ; preds = %"3"
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll b/llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll
new file mode 100644
index 00000000000..f64dcb36f70
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/2016-07-27-loop-vec.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -loop-vectorize -S
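+;
+; There is no FileCheck invocation here; the point of the test is simply that
+; asking the vectorizer (via the llvm.loop.vectorize.enable metadata below) to
+; process this single-iteration loop neither crashes nor hangs it.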
+
+define void @foo() local_unnamed_addr {
+entry:
+ %exitcond = icmp eq i64 3, 3
+ br label %for.body
+
+for.body: ; preds = %entry
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %total1 = add nsw i64 %i.05, 3
+ %inc = add nuw nsw i64 %i.05, 1
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
new file mode 100644
index 00000000000..eb12803a344
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -0,0 +1,79 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; This test checks that we correctly compute the scalarized operands for a
+; user-specified vectorization factor when interleaving is disabled. We use the
+; "optsize" attribute to disable all interleaving calculations. A cost of 4
+; for %tmp4 indicates that we would scalarize its operand (%tmp3), giving
+; %tmp4 a lower scalarization overhead.
+;
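+; In C, the loop is roughly the following sketch (reconstructed from the IR;
+; the divide is unsigned in the IR):
+;
+;   long sum = 0;
+;   for (long i = 0; i < 100; ++i) {
+;     long t = a[i];
+;     if (t > 0)
+;       t = t / (t + x); // the predicated udiv; its operand t + x is scalarized
+;     sum += t;
+;   }
+;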
+; COST-LABEL: predicated_udiv_scalarized_operand
+; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+;
+; CHECK-LABEL: @predicated_udiv_scalarized_operand(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
+; CHECK: [[PRED_UDIV_IF]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[TMP4]], %x
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
+; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
+; CHECK: [[PRED_UDIV_CONTINUE]]:
+; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
+; CHECK: [[PRED_UDIV_IF1]]:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP11]], %x
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP13]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
+; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
+; CHECK: [[PRED_UDIV_CONTINUE2]]:
+; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+ %tmp2 = load i64, i64* %tmp0, align 4
+ %cond0 = icmp sgt i64 %tmp2, 0
+ br i1 %cond0, label %if.then, label %for.inc
+
+if.then:
+ %tmp3 = add nsw i64 %tmp2, %x
+ %tmp4 = udiv i64 %tmp2, %tmp3
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+ %tmp6 = add i64 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond1 = icmp slt i64 %i.next, 100
+ br i1 %cond1, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i64 [ %tmp6, %for.inc ]
+ ret i64 %tmp7
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll
new file mode 100644
index 00000000000..a689f44e912
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-unroll.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
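+; With 128-bit NEON vectors, the loop below should be vectorized by 4 and
+; interleaved (unrolled) by 2: two <4 x i32> loads per input array, two vector
+; adds, and two vector stores per vector-loop iteration, as the CHECK lines
+; verify.
+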
+; Function Attrs: nounwind
+define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
+;CHECK-LABEL: array_add
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+entry:
+ %cmp10 = icmp sgt i32 %size, 0
+ br i1 %cmp10, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %size
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret i32* %c
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
new file mode 100644
index 00000000000..81ecd57b2a8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll
@@ -0,0 +1,147 @@
+; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 | FileCheck %s
+; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnueabi"
+
+; Test integer induction variable of step 2:
+; for (int i = 0; i < 1024; i+=2) {
+; int tmp = *A++;
+; sum += i * tmp;
+; }
+
+; CHECK-LABEL: @ind_plus2(
+; CHECK: load <4 x i32>, <4 x i32>*
+; CHECK: load <4 x i32>, <4 x i32>*
+; CHECK: mul nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: %index.next = add i64 %index, 8
+; CHECK: icmp eq i64 %index.next, 512
+
+; FORCE-VEC-LABEL: @ind_plus2(
+; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>*
+; FORCE-VEC: mul nsw <2 x i32>
+; FORCE-VEC: add nsw <2 x i32>
+; FORCE-VEC: %index.next = add i64 %index, 2
+; FORCE-VEC: icmp eq i64 %index.next, 512
+define i32 @ind_plus2(i32* %A) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ]
+ %i = phi i32 [ 0, %entry ], [ %add1, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
+ %0 = load i32, i32* %A.addr, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %add1 = add nsw i32 %i, 2
+ %cmp = icmp slt i32 %add1, 1024
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %add.lcssa
+}
+
+
+; Test integer induction variable of step -2:
+; for (int i = 1024; i > 0; i-=2) {
+; int tmp = *A++;
+; sum += i * tmp;
+; }
+
+; CHECK-LABEL: @ind_minus2(
+; CHECK: load <4 x i32>, <4 x i32>*
+; CHECK: load <4 x i32>, <4 x i32>*
+; CHECK: mul nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: %index.next = add i64 %index, 8
+; CHECK: icmp eq i64 %index.next, 512
+
+; FORCE-VEC-LABEL: @ind_minus2(
+; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>*
+; FORCE-VEC: mul nsw <2 x i32>
+; FORCE-VEC: add nsw <2 x i32>
+; FORCE-VEC: %index.next = add i64 %index, 2
+; FORCE-VEC: icmp eq i64 %index.next, 512
+define i32 @ind_minus2(i32* %A) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr, %for.body ]
+ %i = phi i32 [ 1024, %entry ], [ %sub, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
+ %0 = load i32, i32* %A.addr, align 4
+ %mul = mul nsw i32 %0, %i
+ %add = add nsw i32 %mul, %sum
+ %sub = add nsw i32 %i, -2
+ %cmp = icmp sgt i32 %i, 2
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %add.lcssa
+}
+
+
+; Test a pointer induction variable of step 2. Since we do not currently
+; support masked loads/stores, vectorization is possible but not beneficial;
+; if loop vectorization is not forced, LV will only interleave.
+; for (int i = 0; i < 1024; i++) {
+; int tmp0 = *A++;
+; int tmp1 = *A++;
+; sum += tmp0 * tmp1;
+; }
+
+; CHECK-LABEL: @ptr_ind_plus2(
+; CHECK: %[[V0:.*]] = load <8 x i32>
+; CHECK: %[[V1:.*]] = load <8 x i32>
+; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: mul nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: %index.next = add i64 %index, 8
+; CHECK: icmp eq i64 %index.next, 1024
+
+; FORCE-VEC-LABEL: @ptr_ind_plus2(
+; FORCE-VEC: %[[V:.*]] = load <4 x i32>
+; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; FORCE-VEC: mul nsw <2 x i32>
+; FORCE-VEC: add nsw <2 x i32>
+; FORCE-VEC: %index.next = add i64 %index, 2
+; FORCE-VEC: icmp eq i64 %index.next, 1024
+define i32 @ptr_ind_plus2(i32* %A) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %A.addr = phi i32* [ %A, %entry ], [ %inc.ptr1, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %inc.ptr = getelementptr inbounds i32, i32* %A.addr, i64 1
+ %0 = load i32, i32* %A.addr, align 4
+ %inc.ptr1 = getelementptr inbounds i32, i32* %A.addr, i64 2
+ %1 = load i32, i32* %inc.ptr, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %sum
+ %inc = add nsw i32 %i, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %add.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll
new file mode 100644
index 00000000000..395b468c509
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/arm64-unroll.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -loop-vectorize -mtriple=arm64-none-linux-gnu -mattr=+neon -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Function Attrs: nounwind
+define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
+;CHECK-LABEL: array_add
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+entry:
+ %cmp10 = icmp sgt i32 %size, 0
+ br i1 %cmp10, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %size
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret i32* %c
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll b/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll
new file mode 100644
index 00000000000..aba47f6c628
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/backedge-overflow.ll
@@ -0,0 +1,166 @@
+; RUN: opt -mtriple=aarch64--linux-gnueabi -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s -S | FileCheck %s
+
+; The following tests contain loops for which SCEV cannot determine the
+; backedge-taken count, because the backedge-taken condition is produced by an
+; icmp with one of its sides being a loop-varying non-AddRec expression.
+; However, such an expression can be normalized to an AddRec using SCEV
+; predicates, which allows us to compute a 'guarded' backedge-taken count. The
+; Loop Vectorizer versions the loop in order to use this guarded count and
+; thereby vectorize more loops.
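+;
+; For instance, @test_sge below is roughly the following C (a sketch
+; reconstructed from the IR):
+;
+;   for (uint16_t i = 0; ; ++i) {
+;     A[i] = B[i] * C[i];
+;     if ((int32_t)i >= N)   // zext(i) is loop-varying but not an AddRec,
+;       break;               // since the 16-bit increment may wrap
+;   }
+;
+; The emitted vector.scevcheck block guards the assumption that the narrow
+; induction variable does not wrap; under that predicate zext(i) becomes a
+; well-formed AddRec.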
+
+
+; CHECK-LABEL: test_sge
+; CHECK-LABEL: vector.scevcheck
+; CHECK-LABEL: vector.body
+define void @test_sge(i32* noalias %A,
+ i32* noalias %B,
+ i32* noalias %C, i32 %N) {
+entry:
+ %cmp13 = icmp eq i32 %N, 0
+ br i1 %cmp13, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ 0, %for.body.preheader ]
+ %indvars.next = add i16 %indvars.iv, 1
+ %indvars.ext = zext i16 %indvars.iv to i32
+
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
+ %1 = load i32, i32* %arrayidx3, align 4
+
+ %mul4 = mul i32 %1, %0
+
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
+ store i32 %mul4, i32* %arrayidx7, align 4
+
+ %exitcond = icmp sge i32 %indvars.ext, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: test_uge
+; CHECK-LABEL: vector.scevcheck
+; CHECK-LABEL: vector.body
+define void @test_uge(i32* noalias %A,
+ i32* noalias %B,
+ i32* noalias %C, i32 %N, i32 %Offset) {
+entry:
+ %cmp13 = icmp eq i32 %N, 0
+ br i1 %cmp13, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ 0, %for.body.preheader ]
+ %indvars.next = add i16 %indvars.iv, 1
+
+ %indvars.ext = sext i16 %indvars.iv to i32
+ %indvars.access = add i32 %Offset, %indvars.ext
+
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.access
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.access
+ %1 = load i32, i32* %arrayidx3, align 4
+
+ %mul4 = add i32 %1, %0
+
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.access
+ store i32 %mul4, i32* %arrayidx7, align 4
+
+ %exitcond = icmp uge i32 %indvars.ext, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: test_ule
+; CHECK-LABEL: vector.scevcheck
+; CHECK-LABEL: vector.body
+define void @test_ule(i32* noalias %A,
+ i32* noalias %B,
+ i32* noalias %C, i32 %N,
+ i16 %M) {
+entry:
+ %cmp13 = icmp eq i32 %N, 0
+ br i1 %cmp13, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ %M, %for.body.preheader ]
+ %indvars.next = sub i16 %indvars.iv, 1
+ %indvars.ext = zext i16 %indvars.iv to i32
+
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
+ %1 = load i32, i32* %arrayidx3, align 4
+
+ %mul4 = mul i32 %1, %0
+
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
+ store i32 %mul4, i32* %arrayidx7, align 4
+
+ %exitcond = icmp ule i32 %indvars.ext, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: test_sle
+; CHECK-LABEL: vector.scevcheck
+; CHECK-LABEL: vector.body
+define void @test_sle(i32* noalias %A,
+ i32* noalias %B,
+ i32* noalias %C, i32 %N,
+ i16 %M) {
+entry:
+ %cmp13 = icmp eq i32 %N, 0
+ br i1 %cmp13, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i16 [ %indvars.next, %for.body ], [ %M, %for.body.preheader ]
+ %indvars.next = sub i16 %indvars.iv, 1
+ %indvars.ext = sext i16 %indvars.iv to i32
+
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %indvars.ext
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %indvars.ext
+ %1 = load i32, i32* %arrayidx3, align 4
+
+ %mul4 = mul i32 %1, %0
+
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %indvars.ext
+ store i32 %mul4, i32* %arrayidx7, align 4
+
+ %exitcond = icmp sle i32 %indvars.ext, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
new file mode 100644
index 00000000000..65f5c4e6266
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S < %s -loop-vectorize -instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+;; See https://llvm.org/bugs/show_bug.cgi?id=25490
+;; Due to the data structures used, the LLVM IR was not deterministic.
+;; This test comes from the PR.
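+;;
+;; The loop is roughly the following C (a sketch reconstructed from the IR,
+;; with a, b and c of type unsigned char *):
+;;
+;;   for (int i = 0; i < n; ++i) {
+;;     a[i] = (a[i] * c[i]) >> 8;
+;;     b[i] = (b[i] * c[i]) >> 8;
+;;   }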
+
+;; CHECK-LABEL: @test(
+; CHECK: load <16 x i8>
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: load <16 x i8>
+; CHECK-NEXT: zext <16 x i8>
+; CHECK-NEXT: zext <16 x i8>
+define void @test(i32 %n, i8* nocapture %a, i8* nocapture %b, i8* nocapture readonly %c) {
+entry:
+ %cmp.28 = icmp eq i32 %n, 0
+ br i1 %cmp.28, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i8, i8* %c, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = zext i8 %1 to i32
+ %mul = mul nuw nsw i32 %conv3, %conv
+ %shr.26 = lshr i32 %mul, 8
+ %conv4 = trunc i32 %shr.26 to i8
+ store i8 %conv4, i8* %arrayidx2, align 1
+ %arrayidx8 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
+ %2 = load i8, i8* %arrayidx8, align 1
+ %conv9 = zext i8 %2 to i32
+ %mul10 = mul nuw nsw i32 %conv9, %conv
+ %shr11.27 = lshr i32 %mul10, 8
+ %conv12 = trunc i32 %shr11.27 to i8
+ store i8 %conv12, i8* %arrayidx8, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll
new file mode 100644
index 00000000000..a40dafe6ec2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-cost.ll
@@ -0,0 +1,85 @@
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+
+@kernel = global [512 x float] zeroinitializer, align 16
+@kernel2 = global [512 x float] zeroinitializer, align 16
+@kernel3 = global [512 x float] zeroinitializer, align 16
+@kernel4 = global [512 x float] zeroinitializer, align 16
+@src_data = global [1536 x float] zeroinitializer, align 16
+@r_ = global i8 0, align 1
+@g_ = global i8 0, align 1
+@b_ = global i8 0, align 1
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive.
+; Make sure we don't vectorize this loop.
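+;
+; The loop is roughly the following C (a sketch reconstructed from the IR);
+; the src_data accesses have stride 3 and would become gathers:
+;
+;   for (i = 0; i < size; ++i) {
+;     r += src_data[3*(i+offset)+0] * kernel[i] * kernel2[i] * kernel3[i] * kernel4[i];
+;     g += src_data[3*(i+offset)+1] * kernel[i] * kernel2[i] * kernel3[i] * kernel4[i];
+;     b += src_data[3*(i+offset)+2] * kernel[i] * kernel2[i] * kernel3[i] * kernel4[i];
+;   }
+;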
+; CHECK-NOT: x float>
+
+define void @_Z4testmm(i64 %size, i64 %offset) {
+entry:
+ %cmp53 = icmp eq i64 %size, 0
+ br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+ %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+ %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+ %add = add i64 %v.055, %offset
+ %mul = mul i64 %add, 3
+ %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055
+ %1 = load float, float* %arrayidx2, align 4
+ %mul3 = fmul fast float %0, %1
+ %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055
+ %2 = load float, float* %arrayidx4, align 4
+ %mul5 = fmul fast float %mul3, %2
+ %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055
+ %3 = load float, float* %arrayidx6, align 4
+ %mul7 = fmul fast float %mul5, %3
+ %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055
+ %4 = load float, float* %arrayidx8, align 4
+ %mul9 = fmul fast float %mul7, %4
+ %add10 = fadd fast float %r.057, %mul9
+ %arrayidx.sum = add i64 %mul, 1
+ %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
+ %5 = load float, float* %arrayidx11, align 4
+ %mul13 = fmul fast float %1, %5
+ %mul15 = fmul fast float %2, %mul13
+ %mul17 = fmul fast float %3, %mul15
+ %mul19 = fmul fast float %4, %mul17
+ %add20 = fadd fast float %g.056, %mul19
+ %arrayidx.sum52 = add i64 %mul, 2
+ %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
+ %6 = load float, float* %arrayidx21, align 4
+ %mul23 = fmul fast float %1, %6
+ %mul25 = fmul fast float %2, %mul23
+ %mul27 = fmul fast float %3, %mul25
+ %mul29 = fmul fast float %4, %mul27
+ %add30 = fadd fast float %b.054, %mul29
+ %inc = add i64 %v.055, 1
+ %exitcond = icmp ne i64 %inc, %size
+ br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+ %add30.lcssa = phi float [ %add30, %for.body ]
+ %add20.lcssa = phi float [ %add20, %for.body ]
+ %add10.lcssa = phi float [ %add10, %for.body ]
+ %phitmp = fptoui float %add10.lcssa to i8
+ %phitmp60 = fptoui float %add20.lcssa to i8
+ %phitmp61 = fptoui float %add30.lcssa to i8
+ br label %for.end
+
+for.end:
+ %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ store i8 %r.0.lcssa, i8* @r_, align 1
+ store i8 %g.0.lcssa, i8* @g_, align 1
+ store i8 %b.0.lcssa, i8* @b_, align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
new file mode 100644
index 00000000000..e8ef4256235
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @non_primary_iv_trunc_free(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @non_primary_iv_trunc_free(i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = trunc i64 %i to i32
+ %i.next = add nuw nsw i64 %i, 5
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
new file mode 100644
index 00000000000..10405b8cd16
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -0,0 +1,37 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; This test shows an extremely high interleaving cost that should probably be fixed.
+; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
+; the load instructions.
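+;
+; The loop is roughly the following C (a sketch reconstructed from the IR;
+; both loaded values are unused, the test only exercises the cost model):
+;
+;   struct pair { char x, y; };
+;   for (i = 0; i < n; ++i) {
+;     char a = p[i].x;   // candidate interleave group with factor 2
+;     char b = p[i].y;
+;   }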
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+ %tmp1 = load i8, i8* %tmp0, align 1
+ %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+ %tmp3 = load i8, i8* %tmp2, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
new file mode 100644
index 00000000000..54ee8fc6e73
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -0,0 +1,189 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnueabi"
+
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_8-LABEL: Checking a loop in "i8_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+ %tmp2 = load i8, i8* %tmp0, align 1
+ %tmp3 = load i8, i8* %tmp1, align 1
+ store i8 0, i8* %tmp0, align 1
+ store i8 0, i8* %tmp1, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
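+; Informal note on the costs checked in these functions (an observation from
+; the checked values themselves, not the exact TTI formula): a factor-2 group
+; costs about 2 per 128 bits of each member vector, i.e. roughly
+;   2 * max(1, VF * element-bits / 128).
+; For example, i16 at VF 16 uses a <16 x i16> (256-bit) member vector, so the
+; checks below expect a cost of 4; i64 at VF 16 (1024 bits) expects 16.
+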
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+ %tmp2 = load i16, i16* %tmp0, align 2
+ %tmp3 = load i16, i16* %tmp1, align 2
+ store i16 0, i16* %tmp0, align 2
+ store i16 0, i16* %tmp1, align 2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+ %tmp2 = load i32, i32* %tmp0, align 4
+ %tmp3 = load i32, i32* %tmp1, align 4
+ store i32 0, i32* %tmp0, align 4
+ store i32 0, i32* %tmp1, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%i64.2 = type {i64, i64}
+define void @i64_factor_2(%i64.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_2-LABEL: Checking a loop in "i64_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL: Checking a loop in "i64_factor_2"
+; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT: Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL: Checking a loop in "i64_factor_2"
+; VF_8: Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT: Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16: Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT: Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
+ %tmp2 = load i64, i64* %tmp0, align 8
+ %tmp3 = load i64, i64* %tmp1, align 8
+ store i64 0, i64* %tmp0, align 8
+ store i64 0, i64* %tmp1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64}
+define void @i64_factor_8(%i64.8* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; The interleave factor in this test is 8, which is greater than the maximum
+; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI
+; implementation for determining the cost of the interleaved load group. The
+; stores do not form a legal interleaved group because the group would contain
+; gaps.
+;
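+; A rough C equivalent of the accesses below (hypothetical reconstruction):
+;
+;   struct S { long f0, f1, f2, f3, f4, f5, f6, f7; };
+;   for (long i = 0; i < n; i++) {
+;     long a = p[i].f2, b = p[i].f6; /* factor-8 load group */
+;     p[i].f2 = 0;                   /* stores leave gaps at f0, f1, f3, ... */
+;     p[i].f6 = 0;
+;   }
+;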
+; VF_2-LABEL: Checking a loop in "i64_factor_8"
+; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
+ %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6
+ %tmp2 = load i64, i64* %tmp0, align 8
+ %tmp3 = load i64, i64* %tmp1, align 8
+ store i64 0, i64* %tmp0, align 8
+ store i64 0, i64* %tmp1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg
new file mode 100644
index 00000000000..937cffb2c11
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll']
+
+if 'AArch64' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
new file mode 100644
index 00000000000..1149afe7b9f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -0,0 +1,310 @@
+; RUN: opt -S < %s -basicaa -loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; CHECK-LABEL: @add_a(
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp8 = icmp sgt i32 %len, 0
+ br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %conv = zext i8 %0 to i32
+ %add = add nuw nsw i32 %conv, 2
+ %conv1 = trunc i32 %add to i8
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %conv1, i8* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
+; working with.
+; CHECK-LABEL: @add_a1(
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp8 = icmp sgt i32 %len, 0
+ br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %add = add nuw nsw i8 %0, 2
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %add, i8* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_b(
+; CHECK: load <8 x i16>, <8 x i16>*
+; CHECK: add <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp9 = icmp sgt i32 %len, 0
+ br i1 %cmp9, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx
+ %conv8 = zext i16 %0 to i32
+ %add = add nuw nsw i32 %conv8, 2
+ %conv1 = trunc i32 %add to i16
+ %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+ store i16 %conv1, i16* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_c(
+; CHECK: load <8 x i8>, <8 x i8>*
+; CHECK: add <8 x i16>
+; CHECK: store <8 x i16>
+; Function Attrs: nounwind
+define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp8 = icmp sgt i32 %len, 0
+ br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %conv = zext i8 %0 to i32
+ %add = add nuw nsw i32 %conv, 2
+ %conv1 = trunc i32 %add to i16
+ %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
+ store i16 %conv1, i16* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_d(
+; CHECK: load <4 x i16>
+; CHECK: add nsw <4 x i32>
+; CHECK: store <4 x i32>
+define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp7 = icmp sgt i32 %len, 0
+ br i1 %cmp7, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx
+ %conv = sext i16 %0 to i32
+ %add = add nsw i32 %conv, 2
+ %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_e(
+; CHECK: load <16 x i8>
+; CHECK: shl <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: or <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: and <16 x i8>
+; CHECK: xor <16 x i8>
+; CHECK: mul <16 x i8>
+; CHECK: store <16 x i8>
+define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+ %cmp.32 = icmp sgt i32 %len, 0
+ br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %conv11 = zext i8 %arg2 to i32
+ %conv13 = zext i8 %arg1 to i32
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %conv = zext i8 %0 to i32
+ %add = shl i32 %conv, 4
+ %conv2 = add nuw nsw i32 %add, 32
+ %or = or i32 %conv, 51
+ %mul = mul nuw nsw i32 %or, 60
+ %and = and i32 %conv2, %conv13
+ %mul.masked = and i32 %mul, 252
+ %conv17 = xor i32 %mul.masked, %conv11
+ %mul18 = mul nuw nsw i32 %conv17, %and
+ %conv19 = trunc i32 %mul18 to i8
+ %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %conv19, i8* %arrayidx21
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_f
+; CHECK: load <8 x i16>
+; CHECK: trunc <8 x i16>
+; CHECK: shl <8 x i8>
+; CHECK: add <8 x i8>
+; CHECK: or <8 x i8>
+; CHECK: mul <8 x i8>
+; CHECK: and <8 x i8>
+; CHECK: xor <8 x i8>
+; CHECK: mul <8 x i8>
+; CHECK: store <8 x i8>
+define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
+entry:
+ %cmp.32 = icmp sgt i32 %len, 0
+ br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %conv11 = zext i8 %arg2 to i32
+ %conv13 = zext i8 %arg1 to i32
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx
+ %conv = sext i16 %0 to i32
+ %add = shl i32 %conv, 4
+ %conv2 = add nsw i32 %add, 32
+ %or = and i32 %conv, 204
+ %conv8 = or i32 %or, 51
+ %mul = mul nuw nsw i32 %conv8, 60
+ %and = and i32 %conv2, %conv13
+ %mul.masked = and i32 %mul, 252
+ %conv17 = xor i32 %mul.masked, %conv11
+ %mul18 = mul nuw nsw i32 %conv17, %and
+ %conv19 = trunc i32 %mul18 to i8
+ %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %conv19, i8* %arrayidx21
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: @add_phifail(
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add nuw nsw <16 x i32>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_phifail(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+ %cmp8 = icmp sgt i32 %len, 0
+ br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %conv = zext i8 %0 to i32
+ %add = add nuw nsw i32 %conv, 2
+ %conv1 = trunc i32 %add to i8
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %conv1, i8* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: nounwind
+; When we vectorize this loop, we generate correct code
+; even when %len is an exact multiple of VF (since we extract the value from
+; the second-to-last lane and pass it to the for.cond.cleanup block). The
+; vectorized loop returns the correct value a_phi = p[len - 2].
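+;
+; A rough scalar equivalent (a hypothetical reconstruction from the IR):
+;
+; char add_phifail2(unsigned char *p, char *q, int len) {
+;   int a_phi = 0;
+;   for (long i = 0; i < len; i++) {
+;     int conv = p[i];
+;     q[i] = (char)(conv + 2);
+;     if (i + 1 < len)
+;       a_phi = conv; /* at exit, a_phi == p[len - 2] for len >= 2 */
+;   }
+;   return (char)a_phi;
+; }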
+define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+; CHECK-LABEL: @add_phifail2(
+; CHECK: vector.body:
+; CHECK: %wide.load = load <16 x i8>, <16 x i8>*
+; CHECK: %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
+; CHECK: add nuw nsw <16 x i32>
+; CHECK: store <16 x i8>
+; CHECK: add i64 %index, 16
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
+; CHECK: %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
+; CHECK: for.cond.cleanup:
+; CHECK: %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
+; CHECK: %ret = trunc i32 %a_phi.lcssa to i8
+; CHECK: ret i8 %ret
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %ret = trunc i32 %a_phi to i8
+ ret i8 %ret
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx
+ %conv = zext i8 %0 to i32
+ %add = add nuw nsw i32 %conv, 2
+ %conv1 = trunc i32 %add to i8
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+ store i8 %conv1, i8* %arrayidx3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %len
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind }
+
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll b/llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll
new file mode 100644
index 00000000000..8b9589aebba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/max-vf-for-interleaved.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -force-vector-interleave=1 -store-to-load-forwarding-conflict-detection=false -loop-vectorize -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%struct.pair = type { i32, i32 }
+
+; Check vectorization of interleaved access groups with positive dependence
+; distances. In this test, the maximum safe dependence distance for
+; vectorization is 16 bytes. Normally, this would lead to a maximum VF of 4.
+; However, for interleaved groups, the effective VF is VF * IF, where IF is the
+; interleave factor. Here, the maximum safe dependence distance is recomputed
+; as 16 / IF bytes, resulting in VF=2. Since IF=2, we should generate <4 x i32>
+; loads and stores instead of <8 x i32> accesses.
+;
+; Note: LAA's conflict detection optimization has to be disabled for this test
+; to be vectorized.
+
+; struct pair {
+; int x;
+; int y;
+; };
+;
+; void max_vf(struct pair *restrict p) {
+; for (int i = 0; i < 1000; i++) {
+;     p[i + 2].x = p[i].x;
+;     p[i + 2].y = p[i].y;
+; }
+; }
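+;
+; Working through the arithmetic (a restatement of the comment above): the
+; dependence distance between p[i] and p[i + 2] is 2 * sizeof(struct pair)
+; = 16 bytes. With 4-byte elements this would normally allow VF = 16 / 4 = 4,
+; but with IF = 2 the safe distance becomes 16 / 2 = 8 bytes, so VF = 8 / 4
+; = 2 and each member access is a <4 x i32> (VF * IF elements).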
+
+; CHECK-LABEL: @max_vf
+; CHECK: load <4 x i32>
+; CHECK: store <4 x i32>
+
+define void @max_vf(%struct.pair* noalias nocapture %p) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %0 = add nuw nsw i64 %i, 2
+ %p_i.x = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %i, i32 0
+ %p_i_plus_2.x = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %0, i32 0
+ %1 = load i32, i32* %p_i.x, align 4
+ store i32 %1, i32* %p_i_plus_2.x, align 4
+ %p_i.y = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %i, i32 1
+ %p_i_plus_2.y = getelementptr inbounds %struct.pair, %struct.pair* %p, i64 %0, i32 1
+ %2 = load i32, i32* %p_i.y, align 4
+ store i32 %2, i32* %p_i_plus_2.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 1000
+ br i1 %cond, label %for.exit, label %for.body
+
+for.exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
new file mode 100644
index 00000000000..247ea35ff5d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -0,0 +1,49 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: all_scalar
+; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
+;
+define void @all_scalar(i64* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr i64, i64* %a, i64 %i
+ store i64 0, i64* %tmp0, align 1
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: PR33193
+; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
+; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
+; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
+%struct.a = type { i32, i8 }
+define void @PR33193(%struct.a* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
+ %tmp0 = getelementptr inbounds %struct.a, %struct.a* %a, i64 %i, i32 1
+ store i8 0, i8* %tmp0, align 4
+ %j.next = add i32 %j, 1
+ %i.next = zext i32 %j.next to i64
+ %cond = icmp ugt i64 %n, %i.next
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
new file mode 100644
index 00000000000..aa8478b0b6a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_test1_no_explicit_vect_width.ll
@@ -0,0 +1,144 @@
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple aarch64-gnu-linux < %s | FileCheck %s
+
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+; int i1, i2;
+;
+; #pragma clang loop vectorize(enable)
+; for (i1 = 0; i1 < 8; i1++) {
+; arr2[i1] = i1;
+; for (i2 = 0; i2 < 8; i2++)
+; arr[i2][i1] = i1 + n;
+; }
+; }
+;
+
+; CHECK-LABEL: @foo_i32(
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+@arrX = external global [8 x i64], align 16
+@arrY = external global [8 x [8 x i64]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo_i32(i32 %n) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.inc8, %entry
+ %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+ %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+ %0 = trunc i64 %indvars.iv21 to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %1 = trunc i64 %indvars.iv21 to i32
+ %add = add nsw i32 %1, %n
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+ store i32 %add, i32* %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8: ; preds = %for.body3
+ %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+ %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+ br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+; CHECK-LABEL: @foo_i64(
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <2 x i64> undef, i64 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <2 x i64> %[[SplatVal]], <2 x i64> undef, <2 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i64], [8 x i64]* @arrX, i64 0, <2 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[VecInd]], <2 x i64*> %[[AAddr]], i32 4, <2 x i1> <i1 true, i1 true>)
+; CHECK: %[[StoreVal:.*]] = add nsw <2 x i64> %[[VecInd]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <2 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, <2 x i64> %[[InnerPhi]], <2 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> %[[StoreVal]], <2 x i64*> %[[AAddr2]], i32 4, <2 x i1> <i1 true, i1 true>
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <2 x i64> %[[InnerPhi]], <i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <2 x i64> %[[InnerPhiNext]], <i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <2 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 2
+; CHECK: %[[VecIndNext]] = add <2 x i64> %[[VecInd]], <i64 2, i64 2>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+; Function Attrs: norecurse nounwind uwtable
+define void @foo_i64(i64 %n) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.inc8, %entry
+ %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+ %arrayidx = getelementptr inbounds [8 x i64], [8 x i64]* @arrX, i64 0, i64 %indvars.iv21
+ store i64 %indvars.iv21, i64* %arrayidx, align 4
+ %add = add nsw i64 %indvars.iv21, %n
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx7 = getelementptr inbounds [8 x [8 x i64]], [8 x [8 x i64]]* @arrY, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+ store i64 %add, i64* %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8: ; preds = %for.body3
+ %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+ %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+ br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll
new file mode 100644
index 00000000000..5ea38a4a246
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr31900.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s
+
+; Reproducer for an address space fault in the LoopVectorizer (PR31900).
+; Different-sized address space pointers (p:16:16-p4:32:16) were added to the
+; aarch64 datalayout to reproduce the fault.
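+;
+; In that datalayout string (per the LLVM datalayout spec): p:16:16 gives
+; default-address-space pointers a 16-bit size and alignment, while p4:32:16
+; gives address-space-4 pointers a 32-bit size with 16-bit alignment, so the
+; two pointer types below have different widths.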
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16"
+
+; Check that all the loads are scalarized
+; CHECK: load i16, i16*
+; CHECK: load i16, i16*
+; CHECK: load i16, i16 addrspace(4)*
+; CHECK: load i16, i16 addrspace(4)*
+
+%rec1445 = type { i16, i16, i16, i16, i16 }
+
+define void @foo() {
+bb1:
+ br label %bb4
+
+bb4:
+ %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ]
+ %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ]
+ %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ]
+ %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1
+ %_tmp987 = load i16, i16* %0, align 1
+ %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1
+ %_tmp993 = load i16, i16 addrspace(4)* %1, align 1
+ %_tmp1013 = add i16 %tmp1, 1
+ %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1
+ %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1
+ %_tmp1019 = icmp ult i16 %_tmp1013, 24
+ br i1 %_tmp1019, label %bb4, label %bb16
+
+bb16:
+ unreachable
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
new file mode 100644
index 00000000000..6763940bf98
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -mtriple=aarch64 -loop-vectorize -force-vector-width=2 < %s | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+@b = common local_unnamed_addr global i32 0, align 4
+@a = common local_unnamed_addr global i16* null, align 8
+
+; Function Attrs: norecurse nounwind readonly
+define i32 @fn1() local_unnamed_addr #0 {
+; Ensure that we don't emit reduction intrinsics for unsupported short reductions.
+; CHECK-NOT: @llvm.experimental.vector.reduce
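+;
+; A rough C equivalent of the loop (hypothetical reconstruction from the IR):
+;
+;   short c = /* undef */, d = /* undef */;
+;   for (long i = 0; i < b; i++) {
+;     c = c > a[i] ? c : a[i]; /* signed max recurrence */
+;     d = d < a[i] ? d : a[i]; /* signed min recurrence */
+;   }
+;   return c > d;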
+entry:
+ %0 = load i32, i32* @b, align 4, !tbaa !1
+ %cmp40 = icmp sgt i32 %0, 0
+ br i1 %cmp40, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %1 = load i16*, i16** @a, align 8, !tbaa !5
+ %2 = load i32, i32* @b, align 4, !tbaa !1
+ %3 = sext i32 %2 to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %d.043 = phi i16 [ undef, %for.body.lr.ph ], [ %.sink28, %for.body ]
+ %c.042 = phi i16 [ undef, %for.body.lr.ph ], [ %c.0., %for.body ]
+ %arrayidx = getelementptr inbounds i16, i16* %1, i64 %indvars.iv
+ %4 = load i16, i16* %arrayidx, align 2, !tbaa !7
+ %cmp2 = icmp sgt i16 %c.042, %4
+ %c.0. = select i1 %cmp2, i16 %c.042, i16 %4
+ %cmp13 = icmp slt i16 %d.043, %4
+ %.sink28 = select i1 %cmp13, i16 %d.043, i16 %4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %cmp = icmp slt i64 %indvars.iv.next, %3
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ %c.0.lcssa = phi i16 [ undef, %entry ], [ %c.0., %for.body ]
+ %d.0.lcssa = phi i16 [ undef, %entry ], [ %.sink28, %for.body ]
+ %cmp26 = icmp sgt i16 %c.0.lcssa, %d.0.lcssa
+ %conv27 = zext i1 %cmp26 to i32
+ ret i32 %conv27
+}
+
+attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
+!llvm.ident = !{!0}
+
+!0 = !{!"clang"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !3, i64 0}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"short", !3, i64 0}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
new file mode 100644
index 00000000000..c51c6c98ddf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mtriple=aarch64-unknown-linux-gnu -force-vector-interleave=1 -force-vector-width=4 < %s | FileCheck %s
+
+; This test checks that there is no assertion failure caused by the issue
+; described in PR36032.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+%struct.anon = type { i8 }
+
+@c = local_unnamed_addr global [6 x i8] zeroinitializer, align 1
+@b = internal global %struct.anon zeroinitializer, align 1
+
+; Function Attrs: noreturn nounwind
+define void @_Z1dv() local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z1dv(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i8* @"_ZN3$_01aEv"(%struct.anon* nonnull @b)
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[CALL]], i64 4
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD5:%.*]], [[FOR_COND_CLEANUP:%.*]] ]
+; CHECK-NEXT: [[G_0:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[G_1_LCSSA:%.*]], [[FOR_COND_CLEANUP]] ]
+; CHECK-NEXT: [[CMP12:%.*]] = icmp ult i32 [[G_0]], 4
+; CHECK-NEXT: [[CONV:%.*]] = and i32 [[F_0]], 65535
+; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[G_0]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 4, [[TMP0]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 3, [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[G_0]], [[CONV]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP4]])
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP3]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP3]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 false, i1 [[TMP7]], i1 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP2]], 4294967295
+; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP13:%.*]] = or i1 false, [[TMP12]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[CALL]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[G_0]], [[CONV]]
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], [[TMP0]]
+; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP16]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP]], [[SCEVGEP3]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP2]], [[SCEVGEP1]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true
+; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[TMP18]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION7:%.*]] = add <4 x i32> [[BROADCAST_SPLAT6]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1, !alias.scope !0
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP25]], i32 0
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP27]], align 1, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[G_1_LCSSA]] = phi i32 [ [[G_0]], [[FOR_COND]] ], [ 4, [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: [[ADD5]] = add nuw nsw i32 [[CONV]], 4
+; CHECK-NEXT: br label [[FOR_COND]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV]], [[TMP29]]
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[ADD]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store i8 [[TMP30]], i8* [[ARRAYIDX3]], align 1
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7
+;
+entry:
+ %call = tail call i8* @"_ZN3$_01aEv"(%struct.anon* nonnull @b) #2
+ br label %for.cond
+
+for.cond: ; preds = %for.cond.cleanup, %entry
+ %f.0 = phi i32 [ 0, %entry ], [ %add5, %for.cond.cleanup ]
+ %g.0 = phi i32 [ undef, %entry ], [ %g.1.lcssa, %for.cond.cleanup ]
+ %cmp12 = icmp ult i32 %g.0, 4
+ %conv = and i32 %f.0, 65535
+ br i1 %cmp12, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %for.cond
+ %0 = zext i32 %g.0 to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %for.cond
+ %g.1.lcssa = phi i32 [ %g.0, %for.cond ], [ 4, %for.cond.cleanup.loopexit ]
+ %add5 = add nuw nsw i32 %conv, 4
+ br label %for.cond
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %1 = trunc i64 %indvars.iv to i32
+ %add = add i32 %conv, %1
+ %idxprom = zext i32 %add to i64
+ %arrayidx = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 %idxprom
+ %2 = load i8, i8* %arrayidx, align 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %call, i64 %indvars.iv
+ store i8 %2, i8* %arrayidx3, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 4
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i8* @"_ZN3$_01aEv"(%struct.anon*) local_unnamed_addr #1
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
new file mode 100644
index 00000000000..b0ebb4edf2a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -0,0 +1,231 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; Check predication-related cost calculations, including scalarization overhead
+; and block probability scaling. Note that the functionality being tested is
+; not specific to AArch64. We specify a target to get actual values for the
+; instruction costs.
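+;
+; As an informal summary of the model exercised below (a sketch based on the
+; 50% block probability assumed throughout, not the exact cost-model code):
+;
+;   predicated scalarized cost ~= (VF * scalar-op-cost + extract/insert
+;                                  overhead) / 2
+;
+; Each per-test comment below works this out with concrete operand costs.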
+
+; CHECK-LABEL: predicated_udiv
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction. If we assume the block probability is 50%, we compute the cost
+; as:
+;
+; Cost of udiv:
+; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+;
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp0, align 4
+ %tmp3 = load i32, i32* %tmp1, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp4 = udiv i32 %tmp2, %tmp3
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then]
+ %tmp6 = add i32 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
+
+; CHECK-LABEL: predicated_store
+;
+; This test checks that we correctly compute the cost of the predicated store
+; instruction. If we assume the block probability is 50%, we compute the cost
+; as:
+;
+; Cost of store:
+; (store(4) + extractelement(3)) / 2 = 3
+;
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+;
+define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ %tmp2 = add nsw i32 %tmp1, %x
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ store i32 %tmp2, i32* %tmp0, align 4
+ br label %for.inc
+
+for.inc:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+; (add(2) + extractelement(3)) / 2 = 2
+; Cost of udiv:
+; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4
+;
+; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x
+; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp2 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp3 = add nsw i32 %tmp2, %x
+ %tmp4 = udiv i32 %tmp2, %tmp3
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then]
+ %tmp6 = add i32 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
+
+; CHECK-LABEL: predicated_store_scalarized_operand
+;
+; This test checks that we correctly compute the cost of the predicated store
+; instruction and the add instruction it uses. The add is scalarized and sunk
+; inside the predicated block. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+; (add(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+; store(4) / 2 = 2
+;
+; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+;
+define void @predicated_store_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp2 = add nsw i32 %tmp1, %x
+ store i32 %tmp2, i32* %tmp0, align 4
+ br label %for.inc
+
+for.inc:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: predication_multi_context
+;
+; This test checks that we correctly compute the cost of multiple predicated
+; instructions in the same block. The sdiv, udiv, and store must be scalarized
+; and predicated. The sub feeding the store is scalarized and sunk inside the
+; store's predicated block. However, the add feeding the sdiv and udiv cannot
+; be sunk and is not scalarized. If we assume the block probability is 50%, we
+; compute the cost as:
+;
+; Cost of add:
+; add(1) = 1
+; Cost of sdiv:
+; (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of udiv:
+; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5
+; Cost of sub:
+; (sub(2) + extractelement(3)) / 2 = 2
+; Cost of store:
+; store(4) / 2 = 2
+;
+; CHECK-NOT: Scalarizing: %tmp2 = add i32 %tmp1, %x
+; CHECK: Scalarizing and predicating: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x
+; CHECK: Scalarizing and predicating: store i32 %tmp5, i32* %tmp0, align 4
+; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x
+; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, i32* %tmp0, align 4
+;
+define void @predication_multi_context(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp2 = add i32 %tmp1, %x
+ %tmp3 = sdiv i32 %tmp1, %tmp2
+ %tmp4 = udiv i32 %tmp3, %tmp2
+ %tmp5 = sub i32 %tmp4, %x
+ store i32 %tmp5, i32* %tmp0, align 4
+ br label %for.inc
+
+for.inc:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
new file mode 100644
index 00000000000..9d9aea00e9a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -0,0 +1,171 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @reduction_i8
+;
+; char reduction_i8(char *a, char *b, int n) {
+; char sum = 0;
+; for (int i = 0; i < n; ++i)
+; sum += (a[i] + b[i]);
+; return sum;
+; }
+;
+; CHECK: vector.body:
+; CHECK: phi <16 x i8>
+; CHECK: load <16 x i8>
+; CHECK: load <16 x i8>
+; CHECK: add <16 x i8>
+; CHECK: add <16 x i8>
+;
+; CHECK: middle.block:
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>
+; CHECK: zext i8 [[Rdx]] to i32
+;
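+; Note (an interpretation of the checks above): although the source sums in
+; i32, the "and i32 %sum.013, 255" in the loop shows that only the low 8 bits
+; are demanded, so the vectorizer can shrink the accumulator to <16 x i8> and
+; widen the final reduced value back to i32 with a zext.
+;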
+define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
+entry:
+ %cmp.12 = icmp sgt i32 %n, 0
+ br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:
+ %add5.lcssa = phi i32 [ %add5, %for.body ]
+ %conv6 = trunc i32 %add5.lcssa to i8
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %sum.0.lcssa = phi i8 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
+ ret i8 %sum.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %sum.013 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = zext i8 %1 to i32
+ %conv4 = and i32 %sum.013, 255
+ %add = add nuw nsw i32 %conv, %conv4
+ %add5 = add nuw nsw i32 %add, %conv3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
+}
+
+; CHECK-LABEL: @reduction_i16_1
+;
+; short reduction_i16_1(short *a, short *b, int n) {
+; short sum = 0;
+; for (int i = 0; i < n; ++i)
+; sum += (a[i] + b[i]);
+; return sum;
+; }
+;
+; CHECK: vector.body:
+; CHECK: phi <8 x i16>
+; CHECK: load <8 x i16>
+; CHECK: load <8 x i16>
+; CHECK: add <8 x i16>
+; CHECK: add <8 x i16>
+;
+; CHECK: middle.block:
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
+; CHECK: zext i16 [[Rdx]] to i32
+;
+define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
+entry:
+ %cmp.16 = icmp sgt i32 %n, 0
+ br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:
+ %add5.lcssa = phi i32 [ %add5, %for.body ]
+ %conv6 = trunc i32 %add5.lcssa to i16
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
+ ret i16 %sum.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %sum.017 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %a, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv.14 = zext i16 %0 to i32
+ %arrayidx2 = getelementptr inbounds i16, i16* %b, i64 %indvars.iv
+ %1 = load i16, i16* %arrayidx2, align 2
+ %conv3.15 = zext i16 %1 to i32
+ %conv4.13 = and i32 %sum.017, 65535
+ %add = add nuw nsw i32 %conv.14, %conv4.13
+ %add5 = add nuw nsw i32 %add, %conv3.15
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
+}
+
+; CHECK-LABEL: @reduction_i16_2
+;
+; short reduction_i16_2(char *a, char *b, int n) {
+; short sum = 0;
+; for (int i = 0; i < n; ++i)
+; sum += (a[i] + b[i]);
+; return sum;
+; }
+;
+; CHECK: vector.body:
+; CHECK: phi <8 x i16>
+; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8>
+; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16>
+; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8>
+; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16>
+; CHECK: add <8 x i16>
+; CHECK: add <8 x i16>
+;
+; CHECK: middle.block:
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
+; CHECK: zext i16 [[Rdx]] to i32
+;
+define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
+entry:
+ %cmp.14 = icmp sgt i32 %n, 0
+ br i1 %cmp.14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:
+ %add5.lcssa = phi i32 [ %add5, %for.body ]
+ %conv6 = trunc i32 %add5.lcssa to i16
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %sum.0.lcssa = phi i16 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
+ ret i16 %sum.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %sum.015 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = zext i8 %1 to i32
+ %conv4.13 = and i32 %sum.015, 65535
+ %add = add nuw nsw i32 %conv, %conv4.13
+ %add5 = add nuw nsw i32 %add, %conv3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll
new file mode 100644
index 00000000000..f3c6548c68e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sdiv-pow2.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -loop-vectorize -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+
+@Foo = common global %struct.anon zeroinitializer, align 4
+
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x i32>, <4 x i32>*
+; CHECK: sdiv <4 x i32>
+; CHECK: store <4 x i32>
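+;
+; An sdiv by a power of two is cheap to vectorize because it lowers to
+; shifts rather than a real division; a hedged sketch of the usual lowering
+; of 'sdiv i32 %x, 2':
+;
+;   %t   = lshr i32 %x, 31   ; sign bit, 0 or 1
+;   %adj = add i32 %x, %t    ; bias negative values to round toward zero
+;   %div = ashr i32 %adj, 1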
+
+define void @foo(){
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %div = sdiv i32 %0, 2
+ %arrayidx2 = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
+ store i32 %div, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
new file mode 100644
index 00000000000..1ae7dadeffd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: Checking a loop in "interleaved_access"
+; CHECK: The Smallest and Widest types: 64 / 64 bits
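+;
+; The maximum VF follows from these widths, roughly
+; MaxVF = WidestRegisterBits / WidestTypeBits (a hedged sketch of the
+; heuristic), so 128-bit NEON registers over 64-bit pointer elements give a
+; maximum VF of 2.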
+;
+define void @interleaved_access(i8** %A, i64 %N) {
+for.ph:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ]
+ %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i
+ store i8* null, i8** %tmp0, align 8
+ %i.next.0 = add nuw nsw i64 %i, 1
+ %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0
+ store i8* null, i8** %tmp1, align 8
+ %i.next.1 = add nsw i64 %i, 2
+ %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1
+ store i8* null, i8** %tmp2, align 8
+ %i.next.2 = add nsw i64 %i, 3
+ %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2
+ store i8* null, i8** %tmp3, align 8
+ %i.next.3 = add nsw i64 %i, 4
+ %cond = icmp slt i64 %i.next.3, %N
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
new file mode 100644
index 00000000000..ffe8480138d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S < %s -loop-vectorize -force-vector-width=4 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: test0
+define void @test0(i16* noalias %M3) {
+entry:
+ br label %if.then1165.us
+
+if.then1165.us: ; preds = %if.then1165.us, %entry
+ %indvars.iv1783 = phi i64 [ 0, %entry ], [ %indvars.iv.next1784, %if.then1165.us ]
+ %conv1177.us = zext i16 undef to i32
+ %add1178.us = add nsw i32 %conv1177.us, undef
+ %conv1179.us = trunc i32 %add1178.us to i16
+ %idxprom1181.us = ashr exact i64 undef, 32
+ %arrayidx1185.us = getelementptr inbounds i16, i16* %M3, i64 %idxprom1181.us
+ store i16 %conv1179.us, i16* %arrayidx1185.us, align 2
+ %indvars.iv.next1784 = add nuw nsw i64 %indvars.iv1783, 1
+ %exitcond1785 = icmp eq i64 %indvars.iv.next1784, 16
+ br i1 %exitcond1785, label %for.inc1286.loopexit, label %if.then1165.us
+
+for.inc1286.loopexit: ; preds = %if.then1165.us
+ ret void
+}
+
+; CHECK-LABEL: test1
+define void @test1(i16* noalias %M3) {
+entry:
+ br label %if.then1165.us
+
+if.then1165.us: ; preds = %if.then1165.us, %entry
+ %indvars.iv1783 = phi i64 [ 0, %entry ], [ %indvars.iv.next1784, %if.then1165.us ]
+ %fptr = load i32, i32* undef, align 4
+ %conv1177.us = zext i16 undef to i32
+ %add1178.us = add nsw i32 %conv1177.us, %fptr
+ %conv1179.us = trunc i32 %add1178.us to i16
+ %idxprom1181.us = ashr exact i64 undef, 32
+ %arrayidx1185.us = getelementptr inbounds i16, i16* %M3, i64 %idxprom1181.us
+ store i16 %conv1179.us, i16* %arrayidx1185.us, align 2
+ %indvars.iv.next1784 = add nuw nsw i64 %indvars.iv1783, 1
+ %exitcond1785 = icmp eq i64 %indvars.iv.next1784, 16
+ br i1 %exitcond1785, label %for.inc1286.loopexit, label %if.then1165.us
+
+for.inc1286.loopexit: ; preds = %if.then1165.us
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll
new file mode 100644
index 00000000000..91a916798c5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/divergent-runtime-check.ll
@@ -0,0 +1,29 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -simplifycfg < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -loop-vectorize -pass-remarks-analysis='loop-vectorize' < %s 2>&1 | FileCheck -check-prefixes=REMARK %s
+
+; GCN-LABEL: @runtime_check_divergent_target(
+; GCN-NOT: load <2 x half>
+; GCN-NOT: store <2 x half>
+
+; REMARK: remark: <unknown>:0:0: loop not vectorized: runtime pointer checks needed. Not enabled for divergent target
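+;
+; On a divergent target, the outcome of a runtime pointer-overlap check can
+; differ between work-items, so the vectorizer declines to emit the check
+; rather than branch on a potentially divergent condition (a hedged summary
+; of the remark above).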
+define amdgpu_kernel void @runtime_check_divergent_target(half addrspace(1)* nocapture %a, half addrspace(1)* nocapture %b) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds half, half addrspace(1)* %b, i64 %indvars.iv
+ %load = load half, half addrspace(1)* %arrayidx, align 4
+ %mul = fmul half %load, 3.0
+ %arrayidx2 = getelementptr inbounds half, half addrspace(1)* %a, i64 %indvars.iv
+ store half %mul, half addrspace(1)* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg
new file mode 100644
index 00000000000..2a665f06be7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'AMDGPU' not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
new file mode 100644
index 00000000000..832843983eb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
@@ -0,0 +1,34 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s -loop-vectorize -dce -instcombine -S | FileCheck -check-prefix=CIVI -check-prefix=GCN %s
+
+; GCN-LABEL: @vectorize_v2f16_loop(
+; GFX9: vector.body:
+; GFX9: phi <2 x half>
+; GFX9: load <2 x half>
+; GFX9: fadd fast <2 x half>
+
+; GFX9: middle.block:
+; GFX9: fadd fast <2 x half>
+
+; CIVI: phi half
+; CIVI: load half
+; CIVI: fadd fast half
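+;
+; gfx9 has packed 16-bit instructions (e.g. v_pk_add_f16), so a <2 x half>
+; VF is profitable there; CI/VI lack packed math, so the loop is expected to
+; stay scalar (a hedged reading of what the prefixes above check).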
+define half @vectorize_v2f16_loop(half addrspace(1)* noalias %s) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %q.04 = phi half [ 0.0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds half, half addrspace(1)* %s, i64 %indvars.iv
+ %0 = load half, half addrspace(1)* %arrayidx, align 2
+ %add = fadd fast half %q.04, %0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ %add.lcssa = phi half [ %add, %for.body ]
+ ret half %add.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
new file mode 100644
index 00000000000..f303ed5377e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -loop-vectorize < %s | FileCheck %s
+
+
+; For AMDGPU, loop unrolling inside the loop vectorizer is disabled when
+; VF == 1, so we expect exactly one store in the output.
+;
+; CHECK-LABEL: @small_loop(
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: ret
+define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
+entry:
+ %0 = icmp sgt i32 %size, 0
+ br i1 %0, label %loop, label %exit
+
+loop: ; preds = %entry, %loop
+ %iv = phi i32 [ %iv1, %loop ], [ 0, %entry ]
+ %1 = getelementptr inbounds i32, i32* %inArray, i32 %iv
+ %2 = load i32, i32* %1, align 4
+ %3 = add nsw i32 %2, 6
+ store i32 %3, i32* %1, align 4
+ %iv1 = add i32 %iv, 1
+ %cond = icmp eq i32 %iv1, %size
+ br i1 %cond, label %exit, label %loop
+
+exit: ; preds = %loop, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
new file mode 100644
index 00000000000..369568f6dfa
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll
@@ -0,0 +1,330 @@
+; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
+; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
+; REQUIRES: asserts
+
+; Test the loop vectorizer's ability to tell when SIMD is safe or not with
+; respect to the IEEE 754 standard.
+; On Linux, we only want the vectorizer to kick in when the -ffast-math flag
+; is set, because NEON is not IEEE compliant.
+; Darwin, on the other hand, doesn't support subnormals, so all of these
+; optimizations are allowed, even without -ffast-math.
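+;
+; For example, the reductions below accumulate in source order as
+; (((x0 + x1) + x2) + x3) + ..., while a vector reduction re-associates to
+; (x0 + x2 + ...) + (x1 + x3 + ...); IEEE 754 does not guarantee those are
+; equal, which is why the FP cases need fast-math (a hedged illustration).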
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "sumi"
+; CHECK: We can vectorize this loop!
+define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Floating-point loops need fast-math to be vectorizable
+; LINUX: Checking a loop in "sumf"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "sumf"
+; DARWIN: We can vectorize this loop!
+define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+ store float %mul, float* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "redi"
+; CHECK: We can vectorize this loop!
+define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops need fast-math to be vectorizable
+; LINUX: Checking a loop in "redf"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "redf"
+; DARWIN: We can vectorize this loop!
+define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul float %0, %1
+ %add = fadd float %Red.06, %mul
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi float [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; LINUX: Checking a loop in "fabs"
+; LINUX: Potentially unsafe FP op prevents vectorization
+; DARWIN: Checking a loop in "fabs"
+; DARWIN: We can vectorize this loop!
+define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+ %1 = load float, float* %arrayidx1, align 4
+ %fabsf = tail call float @fabsf(float %1) #1
+ %conv3 = fmul float %0, %fabsf
+ %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+ store float %conv3, float* %arrayidx4, align 4
+ %inc = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "sumi_fast"
+; CHECK: We can vectorize this loop!
+define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Floating-point loops can be vectorized with fast-math
+; CHECK: Checking a loop in "sumf_fast"
+; CHECK: We can vectorize this loop!
+define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul fast float %1, %0
+ %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
+ store float %mul, float* %arrayidx2, align 4
+ %inc = add nuw nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Integer loops are always vectorizable
+; CHECK: Checking a loop in "redi_fast"
+; CHECK: We can vectorize this loop!
+define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
+ %1 = load i32, i32* %arrayidx1, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %Red.0.lcssa
+}
+
+; Floating-point loops can be vectorized with fast-math
+; CHECK: Checking a loop in "redf_fast"
+; CHECK: We can vectorize this loop!
+define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
+entry:
+ %cmp5 = icmp eq i32 %N, 0
+ br i1 %cmp5, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
+ %1 = load float, float* %arrayidx1, align 4
+ %mul = fmul fast float %1, %0
+ %add = fadd fast float %mul, %Red.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi float [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret float %Red.0.lcssa
+}
+
+; Make sure calls that turn into builtins are also covered
+; CHECK: Checking a loop in "fabs_fast"
+; CHECK: We can vectorize this loop!
+define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
+entry:
+ %cmp10 = icmp eq i32 %N, 0
+ br i1 %cmp10, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
+ %1 = load float, float* %arrayidx1, align 4
+ %fabsf = tail call fast float @fabsf(float %1) #2
+ %conv3 = fmul fast float %fabsf, %0
+ %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
+ store float %conv3, float* %arrayidx4, align 4
+ %inc = add nuw nsw i32 %i.011, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @fabsf(float)
+
+attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
new file mode 100644
index 00000000000..7b09913636f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/arm-unroll.ll
@@ -0,0 +1,71 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFT
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S | FileCheck %s --check-prefix=SWIFTUNROLL
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK-LABEL: @foo(
+;CHECK: load <4 x i32>
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret
+;SWIFT-LABEL: @foo(
+;SWIFT: load <4 x i32>
+;SWIFT: load <4 x i32>
+;SWIFT: ret
+define i32 @foo(i32* nocapture %A, i32 %n) nounwind readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+ %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, %sum.01
+ %5 = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %5, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
+
+; Verify the register limit. On ARM we don't have 16 allocatable registers.
+;SWIFTUNROLL-LABEL: @register_limit(
+;SWIFTUNROLL: load i32
+;SWIFTUNROLL-NOT: load i32
+define i32 @register_limit(i32* nocapture %A, i32 %n) {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:
+ %i.02 = phi i32 [ %5, %.lr.ph ], [ 0, %0 ]
+ %sum.01 = phi i32 [ %4, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %6, %.lr.ph ], [ 0, %0 ]
+ %sum.03 = phi i32 [ %7, %.lr.ph ], [ 0, %0 ]
+ %sum.04 = phi i32 [ %8, %.lr.ph ], [ 0, %0 ]
+ %sum.05 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+ %sum.06 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i32 %i.02
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, %sum.01
+ %5 = add nsw i32 %i.02, 1
+ %6 = add nsw i32 %3, %sum.02
+ %7 = add nsw i32 %3, %sum.03
+ %8 = add nsw i32 %3, %sum.04
+ %9 = add nsw i32 %3, %sum.05
+ %10 = add nsw i32 %3, %sum.06
+ %exitcond = icmp eq i32 %5, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %4, %.lr.ph ]
+ %sum.1.lcssa = phi i32 [ 0, %0 ], [ %6, %.lr.ph ]
+ %sum.2.lcssa = phi i32 [ 0, %0 ], [ %7, %.lr.ph ]
+ %sum.4.lcssa = phi i32 [ 0, %0 ], [ %8, %.lr.ph ]
+ %sum.5.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ %sum.6.lcssa = phi i32 [ 0, %0 ], [ %10, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
new file mode 100644
index 00000000000..6d1fa6f36a9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/gather-cost.ll
@@ -0,0 +1,88 @@
+; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+@kernel = global [512 x float] zeroinitializer, align 4
+@kernel2 = global [512 x float] zeroinitializer, align 4
+@kernel3 = global [512 x float] zeroinitializer, align 4
+@kernel4 = global [512 x float] zeroinitializer, align 4
+@src_data = global [1536 x float] zeroinitializer, align 4
+@r_ = global i8 0, align 4
+@g_ = global i8 0, align 4
+@b_ = global i8 0, align 4
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive. This function represents a point where vectorization starts to
+; become beneficial; make sure we remain conservative and don't vectorize it.
+; CHECK-NOT: <2 x float>
+; CHECK-NOT: <4 x float>
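+;
+; The @src_data accesses below use indices 3*v, 3*v+1, and 3*v+2; with
+; interleaved accesses disabled, each vector lane would need its own scalar
+; load plus an insertelement, and that per-lane cost is what should keep
+; this loop scalar (a hedged reading of the cost model's decision).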
+
+define void @_Z4testmm(i32 %size, i32 %offset) {
+entry:
+ %cmp53 = icmp eq i32 %size, 0
+ br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+ %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+ %v.055 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+ %add = add i32 %v.055, %offset
+ %mul = mul i32 %add, 3
+ %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %mul
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 %v.055
+ %1 = load float, float* %arrayidx2, align 4
+ %mul3 = fmul fast float %0, %1
+ %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 %v.055
+ %2 = load float, float* %arrayidx4, align 4
+ %mul5 = fmul fast float %mul3, %2
+ %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 %v.055
+ %3 = load float, float* %arrayidx6, align 4
+ %mul7 = fmul fast float %mul5, %3
+ %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 %v.055
+ %4 = load float, float* %arrayidx8, align 4
+ %mul9 = fmul fast float %mul7, %4
+ %add10 = fadd fast float %r.057, %mul9
+ %arrayidx.sum = add i32 %mul, 1
+ %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum
+ %5 = load float, float* %arrayidx11, align 4
+ %mul13 = fmul fast float %1, %5
+ %mul15 = fmul fast float %2, %mul13
+ %mul17 = fmul fast float %3, %mul15
+ %mul19 = fmul fast float %4, %mul17
+ %add20 = fadd fast float %g.056, %mul19
+ %arrayidx.sum52 = add i32 %mul, 2
+ %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 %arrayidx.sum52
+ %6 = load float, float* %arrayidx21, align 4
+ %mul23 = fmul fast float %1, %6
+ %mul25 = fmul fast float %2, %mul23
+ %mul27 = fmul fast float %3, %mul25
+ %mul29 = fmul fast float %4, %mul27
+ %add30 = fadd fast float %b.054, %mul29
+ %inc = add i32 %v.055, 1
+ %exitcond = icmp ne i32 %inc, %size
+ br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+ %add30.lcssa = phi float [ %add30, %for.body ]
+ %add20.lcssa = phi float [ %add20, %for.body ]
+ %add10.lcssa = phi float [ %add10, %for.body ]
+ %phitmp = fptoui float %add10.lcssa to i8
+ %phitmp60 = fptoui float %add20.lcssa to i8
+ %phitmp61 = fptoui float %add30.lcssa to i8
+ br label %for.end
+
+for.end:
+ %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ store i8 %r.0.lcssa, i8* @r_, align 4
+ store i8 %g.0.lcssa, i8* @g_, align 4
+ store i8 %b.0.lcssa, i8* @b_, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
new file mode 100644
index 00000000000..783156d7399
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/gcc-examples.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift -S -dce | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 4;
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example10b(
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+ %3 = load i16, i16* %2, align 2
+ %4 = sext i16 %3 to i32
+ %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+ store i32 %4, i32* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
new file mode 100644
index 00000000000..29adec049f6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -0,0 +1,147 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "armv8--linux-gnueabihf"
+
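+; Each pair of struct fields below forms a factor-2 interleave group that
+; NEON can handle with a single vld2/vst2. The cost model charges the whole
+; group to one member (the first load and the last store), which is why the
+; other members show a cost of 0 in the checks (a hedged reading of the
+; expected numbers).
+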
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_8-LABEL: Checking a loop in "i8_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16: Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT: Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+ %tmp2 = load i8, i8* %tmp0, align 1
+ %tmp3 = load i8, i8* %tmp1, align 1
+ store i8 0, i8* %tmp0, align 1
+ store i8 0, i8* %tmp1, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "i16_factor_2"
+; VF_8: Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16: Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT: Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+ %tmp2 = load i16, i16* %tmp0, align 2
+ %tmp3 = load i16, i16* %tmp1, align 2
+ store i16 0, i16* %tmp0, align 2
+ store i16 0, i16* %tmp1, align 2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_2-LABEL: Checking a loop in "i32_factor_2"
+; VF_2: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT: Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL: Checking a loop in "i32_factor_2"
+; VF_4: Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT: Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL: Checking a loop in "i32_factor_2"
+; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT: Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16: Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT: Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+ %tmp2 = load i32, i32* %tmp0, align 4
+ %tmp3 = load i32, i32* %tmp1, align 4
+ store i32 0, i32* %tmp0, align 4
+ store i32 0, i32* %tmp1, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+%half.2 = type {half, half}
+define void @half_factor_2(%half.2* %data, i64 %n) {
+entry:
+ br label %for.body
+
+; VF_4-LABEL: Checking a loop in "half_factor_2"
+; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "half_factor_2"
+; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
+ %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1
+ %tmp2 = load half, half* %tmp0, align 2
+ %tmp3 = load half, half* %tmp1, align 2
+ store half 0.0, half* %tmp0, align 2
+ store half 0.0, half* %tmp1, align 2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
new file mode 100644
index 00000000000..98c6700c209
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/lit.local.cfg
@@ -0,0 +1,3 @@
+if 'ARM' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
new file mode 100644
index 00000000000..e88fcca1225
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mul-cast-vect.ll
@@ -0,0 +1,114 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=armv7-linux-gnueabihf -mcpu=cortex-a9 | FileCheck --check-prefix=COST %s
+; To see the assembly output: llc -mcpu=cortex-a9 < %s | FileCheck --check-prefix=ASM %s
+; The ASM lines below are for reference only; tests in that direction belong
+; in test/CodeGen/ARM.
+
+; ModuleID = 'arm.ll'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7--linux-gnueabihf"
+
+%T216 = type <2 x i16>
+%T232 = type <2 x i32>
+%T264 = type <2 x i64>
+
+%T416 = type <4 x i16>
+%T432 = type <4 x i32>
+%T464 = type <4 x i64>
+
+define void @direct(%T432* %loadaddr, %T432* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'direct':
+ %v0 = load %T432, %T432* %loadaddr
+; ASM: vld1.64
+ %v1 = load %T432, %T432* %loadaddr2
+; ASM: vld1.64
+ %r3 = mul %T432 %v0, %v1
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmul.i32
+ store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @ups1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'ups1632':
+ %v0 = load %T416, %T416* %loadaddr
+; ASM: vldr
+ %v1 = load %T416, %T416* %loadaddr2
+; ASM: vldr
+ %r1 = sext %T416 %v0 to %T432
+ %r2 = sext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} sext <4 x i16> {{.*}} to <4 x i32>
+ %r3 = mul %T432 %r1, %r2
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.s16
+ store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @upu1632(%T416* %loadaddr, %T416* %loadaddr2, %T432* %storeaddr) {
+; COST: function 'upu1632':
+ %v0 = load %T416, %T416* %loadaddr
+; ASM: vldr
+ %v1 = load %T416, %T416* %loadaddr2
+; ASM: vldr
+ %r1 = zext %T416 %v0 to %T432
+ %r2 = zext %T416 %v1 to %T432
+; COST: cost of 0 for instruction: {{.*}} zext <4 x i16> {{.*}} to <4 x i32>
+ %r3 = mul %T432 %r1, %r2
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+; ASM: vmull.u16
+ store %T432 %r3, %T432* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @ups3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'ups3264':
+ %v0 = load %T232, %T232* %loadaddr
+; ASM: vldr
+ %v1 = load %T232, %T232* %loadaddr2
+; ASM: vldr
+ %r3 = mul %T232 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+ %st = sext %T232 %r3 to %T264
+; ASM: vmovl.s32
+; COST: cost of 1 for instruction: {{.*}} sext <2 x i32> {{.*}} to <2 x i64>
+ store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @upu3264(%T232* %loadaddr, %T232* %loadaddr2, %T264* %storeaddr) {
+; COST: function 'upu3264':
+ %v0 = load %T232, %T232* %loadaddr
+; ASM: vldr
+ %v1 = load %T232, %T232* %loadaddr2
+; ASM: vldr
+ %r3 = mul %T232 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 1 for instruction: {{.*}} mul <2 x i32>
+ %st = zext %T232 %r3 to %T264
+; ASM: vmovl.u32
+; COST: cost of 1 for instruction: {{.*}} zext <2 x i32> {{.*}} to <2 x i64>
+ store %T264 %st, %T264* %storeaddr
+; ASM: vst1.64
+ ret void
+}
+
+define void @dn3216(%T432* %loadaddr, %T432* %loadaddr2, %T416* %storeaddr) {
+; COST: function 'dn3216':
+ %v0 = load %T432, %T432* %loadaddr
+; ASM: vld1.64
+ %v1 = load %T432, %T432* %loadaddr2
+; ASM: vld1.64
+ %r3 = mul %T432 %v0, %v1
+; ASM: vmul.i32
+; COST: cost of 2 for instruction: {{.*}} mul <4 x i32>
+ %st = trunc %T432 %r3 to %T416
+; ASM: vmovn.i32
+; COST: cost of 1 for instruction: {{.*}} trunc <4 x i32> {{.*}} to <4 x i16>
+ store %T416 %st, %T416* %storeaddr
+; ASM: vstr
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
new file mode 100644
index 00000000000..a1cf4b318f3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv8-unknown-unknown -mcpu=cortex-a53 -S | FileCheck %s
+
+; This test is reduced from SPECFP 2006 482.sphinx.
+; We expect vectorization with <2 x double> and <2 x float> ops.
+; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details.
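+;
+; The inner loop is a running 'fsub fast' reduction over double, so the
+; vector body is expected to carry a <2 x double> accumulator, with the
+; final horizontal reduction done in the middle block, as the autogenerated
+; checks below show.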
+
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+@a = external global i32
+@v = external global i32
+@mm = external global float**
+@vv = external global float**
+@ll = external global float*
+
+define i32 @test(float* nocapture readonly %x) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[T:%.*]] = load i32, i32* @v, align 8
+; CHECK-NEXT: [[T1:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT: br label [[OUTERLOOP:%.*]]
+; CHECK: outerloop:
+; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[V17:%.*]], [[OUTEREND:%.*]] ], [ [[T1]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[J_0136:%.*]] = phi i32 [ [[INC144:%.*]], [[OUTEREND]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[SCORE_1135:%.*]] = phi i32 [ [[CALL142:%.*]], [[OUTEREND]] ], [ -939524096, [[ENTRY]] ]
+; CHECK-NEXT: [[T3:%.*]] = load float**, float*** @mm, align 4
+; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds float*, float** [[T3]], i32 [[T2]]
+; CHECK-NEXT: [[T4:%.*]] = load float*, float** [[ARRAYIDX109]], align 4
+; CHECK-NEXT: [[T5:%.*]] = load float**, float*** @vv, align 4
+; CHECK-NEXT: [[ARRAYIDX111:%.*]] = getelementptr inbounds float*, float** [[T5]], i32 [[T2]]
+; CHECK-NEXT: [[T6:%.*]] = load float*, float** [[ARRAYIDX111]], align 4
+; CHECK-NEXT: [[T7:%.*]] = load float*, float** @ll, align 4
+; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]]
+; CHECK-NEXT: [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4
+; CHECK-NEXT: [[CONV114:%.*]] = fpext float [[T8]] to double
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[T]], 2
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[T]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[CONV114]], i32 0
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
+; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP14]]
+; CHECK-NEXT: [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK: middle.block:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[INNERLOOP:%.*]]
+; CHECK: innerloop:
+; CHECK-NEXT: [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ]
+; CHECK-NEXT: [[DVAL1_4131:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUB127:%.*]], [[INNERLOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds float, float* [[X]], i32 [[I_2132]]
+; CHECK-NEXT: [[T9:%.*]] = load float, float* [[ARRAYIDX119]], align 4
+; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[I_2132]]
+; CHECK-NEXT: [[T10:%.*]] = load float, float* [[ARRAYIDX120]], align 4
+; CHECK-NEXT: [[SUB121:%.*]] = fsub fast float [[T9]], [[T10]]
+; CHECK-NEXT: [[CONV122:%.*]] = fpext float [[SUB121]] to double
+; CHECK-NEXT: [[MUL123:%.*]] = fmul fast double [[CONV122]], [[CONV122]]
+; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[I_2132]]
+; CHECK-NEXT: [[T11:%.*]] = load float, float* [[ARRAYIDX124]], align 4
+; CHECK-NEXT: [[CONV125:%.*]] = fpext float [[T11]] to double
+; CHECK-NEXT: [[MUL126:%.*]] = fmul fast double [[MUL123]], [[CONV125]]
+; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
+; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1
+; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
+; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop !2
+; CHECK: outerend:
+; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
+; CHECK-NEXT: [[CALL142]] = add nuw nsw i32 [[SCORE_1135]], [[CONV138]]
+; CHECK-NEXT: [[INC144]] = add nuw nsw i32 [[J_0136]], 1
+; CHECK-NEXT: [[ARRAYIDX102:%.*]] = getelementptr inbounds i32, i32* @a, i32 [[INC144]]
+; CHECK-NEXT: [[V17]] = load i32, i32* [[ARRAYIDX102]], align 4
+; CHECK-NEXT: [[CMP103:%.*]] = icmp sgt i32 [[V17]], -1
+; CHECK-NEXT: br i1 [[CMP103]], label [[OUTERLOOP]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 [[CALL142]]
+;
+entry:
+ %t = load i32, i32* @v, align 8
+ %t1 = load i32, i32* @a, align 4
+ br label %outerloop
+
+outerloop:
+ %t2 = phi i32 [ %v17, %outerend ], [ %t1, %entry ]
+ %j.0136 = phi i32 [ %inc144, %outerend ], [ 0, %entry ]
+ %score.1135 = phi i32 [ %call142, %outerend ], [ -939524096, %entry ]
+ %t3 = load float**, float*** @mm, align 4
+ %arrayidx109 = getelementptr inbounds float*, float** %t3, i32 %t2
+ %t4 = load float*, float** %arrayidx109, align 4
+ %t5 = load float**, float*** @vv, align 4
+ %arrayidx111 = getelementptr inbounds float*, float** %t5, i32 %t2
+ %t6 = load float*, float** %arrayidx111, align 4
+ %t7 = load float*, float** @ll, align 4
+ %arrayidx113 = getelementptr inbounds float, float* %t7, i32 %t2
+ %t8 = load float, float* %arrayidx113, align 4
+ %conv114 = fpext float %t8 to double
+ br label %innerloop
+
+innerloop:
+ %i.2132 = phi i32 [ 0, %outerloop ], [ %inc129, %innerloop ]
+ %dval1.4131 = phi double [ %conv114, %outerloop ], [ %sub127, %innerloop ]
+ %arrayidx119 = getelementptr inbounds float, float* %x, i32 %i.2132
+ %t9 = load float, float* %arrayidx119, align 4
+ %arrayidx120 = getelementptr inbounds float, float* %t4, i32 %i.2132
+ %t10 = load float, float* %arrayidx120, align 4
+ %sub121 = fsub fast float %t9, %t10
+ %conv122 = fpext float %sub121 to double
+ %mul123 = fmul fast double %conv122, %conv122
+ %arrayidx124 = getelementptr inbounds float, float* %t6, i32 %i.2132
+ %t11 = load float, float* %arrayidx124, align 4
+ %conv125 = fpext float %t11 to double
+ %mul126 = fmul fast double %mul123, %conv125
+ %sub127 = fsub fast double %dval1.4131, %mul126
+ %inc129 = add nuw nsw i32 %i.2132, 1
+ %exitcond143 = icmp eq i32 %inc129, %t
+ br i1 %exitcond143, label %outerend, label %innerloop
+
+outerend:
+ %sub127.lcssa = phi double [ %sub127, %innerloop ]
+ %conv138 = fptosi double %sub127.lcssa to i32
+ %call142 = add nuw nsw i32 %score.1135, %conv138
+ %inc144 = add nuw nsw i32 %j.0136, 1
+ %arrayidx102 = getelementptr inbounds i32, i32* @a, i32 %inc144
+ %v17 = load i32, i32* %arrayidx102, align 4
+ %cmp103 = icmp sgt i32 %v17, -1
+ br i1 %cmp103, label %outerloop, label %exit
+
+exit:
+ ret i32 %call142
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll b/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
new file mode 100644
index 00000000000..3be22d708da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/vector_cast.ll
@@ -0,0 +1,37 @@
+; RUN: opt -loop-vectorize -tbaa -S -mattr=+neon < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabi"
+
+; This requires the loop vectorizer to create an interleaved access group
+; for the stores to the struct. Here we need to perform a bitcast from a
+; vector of pointers to a vector of i32s.
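+; A rough sketch of why (our reading; only the vectorized output is
+; authoritative): on this 32-bit target %class.A occupies two 32-bit slots,
+; so a VF=2 interleaved store group writes one <4 x i32>, and the i8* lanes
+; must first be cast from a vector of pointers into i32 lanes.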
+
+%class.A = type { i8*, i32 }
+
+; CHECK-LABEL: test0
+define void @test0(%class.A* %StartPtr, %class.A* %APtr) {
+entry:
+ br label %for.body.i
+
+for.body.i:
+ %addr = phi %class.A* [ %StartPtr, %entry ], [ %incdec.ptr.i, %for.body.i ]
+ %Data.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 0
+ store i8* null, i8** %Data.i.i, align 4, !tbaa !8
+ %Length.i.i = getelementptr inbounds %class.A, %class.A* %addr, i32 0, i32 1
+ store i32 0, i32* %Length.i.i, align 4, !tbaa !11
+ %incdec.ptr.i = getelementptr inbounds %class.A, %class.A* %addr, i32 1
+ %cmp.i = icmp eq %class.A* %incdec.ptr.i, %APtr
+ br i1 %cmp.i, label %exit, label %for.body.i
+
+exit:
+ ret void
+}
+
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!9, !5, i64 0}
+!9 = !{!"some struct", !5, i64 0, !10, i64 4}
+!10 = !{!"int", !6, i64 0}
+!11 = !{!9, !10, i64 4}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll b/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
new file mode 100644
index 00000000000..66d2556dfb8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/width-detect.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK:foo_F32
+;CHECK: <4 x float>
+;CHECK:ret
+define float @foo_F32(float* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %prod.01 = phi float [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+ %2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %3 = load float, float* %2, align 8
+ %4 = fmul fast float %prod.01, %3
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %prod.0.lcssa = phi float [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+ ret float %prod.0.lcssa
+}
+
+;CHECK:foo_I8
+;CHECK: xor <16 x i8>
+;CHECK:ret
+define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+ %3 = load i8, i8* %2, align 1
+ %4 = xor i8 %3, %red.01
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
+ ret i8 %red.0.lcssa
+}
+
+
diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg
new file mode 100644
index 00000000000..cc6a7edf05f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'Hexagon' not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll
new file mode 100644
index 00000000000..ad44356448b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll
@@ -0,0 +1,173 @@
+; RUN: opt -march=hexagon -loop-vectorize -hexagon-autohvx -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Check that TTI::getMinimumVF works. The calculated MaxVF was based on
+; register pressure and was less than 64.
+; CHECK: LV: Overriding calculated MaxVF({{[0-9]+}}) with target's minimum: 64
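+; Background (our summary; the CHECK line is authoritative): this HVX
+; configuration has 64-byte vectors (+hvx-length64b), so for the i8 accesses
+; in the loop the target reports a minimum VF of 64, overriding the smaller
+; MaxVF derived from register pressure.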
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+%s.0 = type { i8*, i32, i32, i32, i32 }
+
+@g0 = external dso_local local_unnamed_addr global %s.0**, align 4
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+define hidden fastcc void @f0(i8* nocapture %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i8 zeroext %a5) unnamed_addr #1 {
+b0:
+ %v0 = alloca [4 x [9 x i16]], align 8
+ %v1 = bitcast [4 x [9 x i16]]* %v0 to i8*
+ call void @llvm.lifetime.start.p0i8(i64 72, i8* nonnull %v1) #2
+ %v2 = add i32 %a1, -2
+ %v3 = add i32 %a3, -9
+ %v4 = icmp ugt i32 %v2, %v3
+ %v5 = add i32 %a2, -2
+ %v6 = add i32 %a4, -9
+ %v7 = icmp ugt i32 %v5, %v6
+ %v8 = or i1 %v4, %v7
+ %v9 = load %s.0**, %s.0*** @g0, align 4, !tbaa !1
+ %v10 = zext i8 %a5 to i32
+ %v11 = getelementptr inbounds %s.0*, %s.0** %v9, i32 %v10
+ %v12 = load %s.0*, %s.0** %v11, align 4, !tbaa !1
+ %v13 = getelementptr inbounds %s.0, %s.0* %v12, i32 0, i32 0
+ %v14 = load i8*, i8** %v13, align 4, !tbaa !5
+ br i1 %v8, label %b1, label %b2
+
+b1: ; preds = %b1, %b0
+ %v15 = phi i32 [ 0, %b0 ], [ %v119, %b1 ]
+ %v16 = add i32 %v5, %v15
+ %v17 = icmp slt i32 %v16, 0
+ %v18 = icmp slt i32 %v16, %a4
+ %v19 = select i1 %v18, i32 %v16, i32 %v3
+ %v20 = select i1 %v17, i32 0, i32 %v19
+ %v21 = mul i32 %v20, %a3
+ %v22 = add i32 97, %v21
+ %v23 = getelementptr inbounds i8, i8* %v14, i32 %v22
+ %v24 = load i8, i8* %v23, align 1, !tbaa !8
+ %v25 = zext i8 %v24 to i32
+ %v26 = add i32 101, %v21
+ %v27 = getelementptr inbounds i8, i8* %v14, i32 %v26
+ %v28 = load i8, i8* %v27, align 1, !tbaa !8
+ %v29 = zext i8 %v28 to i32
+ %v30 = mul nsw i32 %v29, -5
+ %v31 = add nsw i32 %v30, %v25
+ %v32 = add i32 106, %v21
+ %v33 = getelementptr inbounds i8, i8* %v14, i32 %v32
+ %v34 = load i8, i8* %v33, align 1, !tbaa !8
+ %v35 = zext i8 %v34 to i32
+ %v36 = mul nuw nsw i32 %v35, 20
+ %v37 = add nsw i32 %v36, %v31
+ %v38 = add i32 111, %v21
+ %v39 = getelementptr inbounds i8, i8* %v14, i32 %v38
+ %v40 = load i8, i8* %v39, align 1, !tbaa !8
+ %v41 = zext i8 %v40 to i32
+ %v42 = mul nuw nsw i32 %v41, 20
+ %v43 = add nsw i32 %v42, %v37
+ %v44 = add i32 116, %v21
+ %v45 = getelementptr inbounds i8, i8* %v14, i32 %v44
+ %v46 = load i8, i8* %v45, align 1, !tbaa !8
+ %v47 = zext i8 %v46 to i32
+ %v48 = mul nsw i32 %v47, -5
+ %v49 = add nsw i32 %v48, %v43
+ %v50 = add i32 120, %v21
+ %v51 = getelementptr inbounds i8, i8* %v14, i32 %v50
+ %v52 = load i8, i8* %v51, align 1, !tbaa !8
+ %v53 = zext i8 %v52 to i32
+ %v54 = add nsw i32 %v49, %v53
+ %v55 = trunc i32 %v54 to i16
+ %v56 = getelementptr inbounds [4 x [9 x i16]], [4 x [9 x i16]]* %v0, i32 0, i32 0, i32 %v15
+ store i16 %v55, i16* %v56, align 2, !tbaa !9
+ %v57 = mul nsw i32 %v35, -5
+ %v58 = add nsw i32 %v57, %v29
+ %v59 = add nsw i32 %v42, %v58
+ %v60 = mul nuw nsw i32 %v47, 20
+ %v61 = add nsw i32 %v60, %v59
+ %v62 = mul nsw i32 %v53, -5
+ %v63 = add nsw i32 %v62, %v61
+ %v64 = add i32 125, %v21
+ %v65 = getelementptr inbounds i8, i8* %v14, i32 %v64
+ %v66 = load i8, i8* %v65, align 1, !tbaa !8
+ %v67 = zext i8 %v66 to i32
+ %v68 = add nsw i32 %v63, %v67
+ %v69 = trunc i32 %v68 to i16
+ %v70 = getelementptr inbounds [4 x [9 x i16]], [4 x [9 x i16]]* %v0, i32 0, i32 1, i32 %v15
+ store i16 %v69, i16* %v70, align 2, !tbaa !9
+ %v71 = mul nsw i32 %v41, -5
+ %v72 = add nsw i32 %v71, %v35
+ %v73 = add nsw i32 %v60, %v72
+ %v74 = mul nuw nsw i32 %v53, 20
+ %v75 = add nsw i32 %v74, %v73
+ %v76 = mul nsw i32 %v67, -5
+ %v77 = add nsw i32 %v76, %v75
+ %v78 = add i32 130, %v21
+ %v79 = getelementptr inbounds i8, i8* %v14, i32 %v78
+ %v80 = load i8, i8* %v79, align 1, !tbaa !8
+ %v81 = zext i8 %v80 to i32
+ %v82 = add nsw i32 %v77, %v81
+ %v83 = trunc i32 %v82 to i16
+ %v84 = getelementptr inbounds [4 x [9 x i16]], [4 x [9 x i16]]* %v0, i32 0, i32 2, i32 %v15
+ store i16 %v83, i16* %v84, align 2, !tbaa !9
+ %v85 = add i32 92, %v21
+ %v86 = getelementptr inbounds i8, i8* %v14, i32 %v85
+ %v87 = load i8, i8* %v86, align 1, !tbaa !8
+ %v88 = zext i8 %v87 to i16
+ %v89 = add i32 135, %v21
+ %v90 = getelementptr inbounds i8, i8* %v14, i32 %v89
+ %v91 = load i8, i8* %v90, align 1, !tbaa !8
+ %v92 = zext i8 %v91 to i16
+ %v93 = mul nsw i16 %v92, -5
+ %v94 = add nsw i16 %v93, %v88
+ %v95 = add i32 140, %v21
+ %v96 = getelementptr inbounds i8, i8* %v14, i32 %v95
+ %v97 = load i8, i8* %v96, align 1, !tbaa !8
+ %v98 = zext i8 %v97 to i16
+ %v99 = mul nuw nsw i16 %v98, 20
+ %v100 = add nsw i16 %v99, %v94
+ %v101 = add i32 145, %v21
+ %v102 = getelementptr inbounds i8, i8* %v14, i32 %v101
+ %v103 = load i8, i8* %v102, align 1, !tbaa !8
+ %v104 = zext i8 %v103 to i16
+ %v105 = mul nuw nsw i16 %v104, 20
+ %v106 = add i16 %v105, %v100
+ %v107 = add i32 150, %v21
+ %v108 = getelementptr inbounds i8, i8* %v14, i32 %v107
+ %v109 = load i8, i8* %v108, align 1, !tbaa !8
+ %v110 = zext i8 %v109 to i16
+ %v111 = mul nsw i16 %v110, -5
+ %v112 = add i16 %v111, %v106
+ %v113 = add i32 154, %v21
+ %v114 = getelementptr inbounds i8, i8* %v14, i32 %v113
+ %v115 = load i8, i8* %v114, align 1, !tbaa !8
+ %v116 = zext i8 %v115 to i16
+ %v117 = add i16 %v112, %v116
+ %v118 = getelementptr inbounds [4 x [9 x i16]], [4 x [9 x i16]]* %v0, i32 0, i32 3, i32 %v15
+ store i16 %v117, i16* %v118, align 2, !tbaa !9
+ %v119 = add nuw nsw i32 %v15, 1
+ %v120 = icmp eq i32 %v119, 19
+ br i1 %v120, label %b2, label %b1
+
+b2: ; preds = %b1, %b0
+ call void @llvm.lifetime.end.p0i8(i64 72, i8* nonnull %v1) #2
+ ret void
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b,+hvxv60" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"any pointer", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !2, i64 0}
+!6 = !{!"", !2, i64 0, !7, i64 4, !7, i64 8, !7, i64 12, !7, i64 16}
+!7 = !{!"int", !3, i64 0}
+!8 = !{!3, !3, i64 0}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"short", !3, i64 0}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll
new file mode 100644
index 00000000000..3491e08bbaa
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/agg-interleave-a2.ll
@@ -0,0 +1,40 @@
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %c) #0 {
+entry:
+ br label %for.body
+
+; CHECK-LABEL: @foo
+; CHECK: fmul <4 x double> %{{[^,]+}}, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
+; CHECK-NEXT: fmul <4 x double> %{{[^,]+}}, <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %mul = fmul double %0, 2.000000e+00
+ %mul3 = fmul double %0, %mul
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 %indvars.iv
+ %1 = load double, double* %arrayidx5, align 8
+ %mul6 = fmul double %1, 3.000000e+00
+ %mul9 = fmul double %1, %mul6
+ %add = fadd double %mul3, %mul9
+ %mul12 = fmul double %0, 4.000000e+00
+ %mul15 = fmul double %mul12, %1
+ %add16 = fadd double %mul15, %add
+ %add17 = fadd double %add16, 1.000000e+00
+ %arrayidx19 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ store double %add17, double* %arrayidx19, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1600
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind "target-cpu"="a2q" }
+
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
new file mode 100644
index 00000000000..c88b496e4e9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -0,0 +1,75 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: =
+; CHECK-NOT: fadd
+; CHECK-SAME: >
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @QLA_F3_r_veq_norm2_V(float* noalias nocapture %r, [3 x { float, float }]* noalias nocapture readonly %a, i32 signext %n) #0 {
+entry:
+ %cmp24 = icmp sgt i32 %n, 0
+ br i1 %cmp24, label %for.cond1.preheader.preheader, label %for.end13
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.cond1.preheader
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.cond1.preheader ], [ 0, %for.cond1.preheader.preheader ]
+ %sum.026 = phi double [ %add10.2, %for.cond1.preheader ], [ 0.000000e+00, %for.cond1.preheader.preheader ]
+ %arrayidx5.realp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 0
+ %arrayidx5.real = load float, float* %arrayidx5.realp, align 8
+ %arrayidx5.imagp = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 1
+ %arrayidx5.imag = load float, float* %arrayidx5.imagp, align 8
+ %mul = fmul fast float %arrayidx5.real, %arrayidx5.real
+ %mul9 = fmul fast float %arrayidx5.imag, %arrayidx5.imag
+ %add = fadd fast float %mul9, %mul
+ %conv = fpext float %add to double
+ %add10 = fadd fast double %conv, %sum.026
+ %arrayidx5.realp.1 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 0
+ %arrayidx5.real.1 = load float, float* %arrayidx5.realp.1, align 8
+ %arrayidx5.imagp.1 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 1
+ %arrayidx5.imag.1 = load float, float* %arrayidx5.imagp.1, align 8
+ %mul.1 = fmul fast float %arrayidx5.real.1, %arrayidx5.real.1
+ %mul9.1 = fmul fast float %arrayidx5.imag.1, %arrayidx5.imag.1
+ %add.1 = fadd fast float %mul9.1, %mul.1
+ %conv.1 = fpext float %add.1 to double
+ %add10.1 = fadd fast double %conv.1, %add10
+ %arrayidx5.realp.2 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 0
+ %arrayidx5.real.2 = load float, float* %arrayidx5.realp.2, align 8
+ %arrayidx5.imagp.2 = getelementptr inbounds [3 x { float, float }], [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 1
+ %arrayidx5.imag.2 = load float, float* %arrayidx5.imagp.2, align 8
+ %mul.2 = fmul fast float %arrayidx5.real.2, %arrayidx5.real.2
+ %mul9.2 = fmul fast float %arrayidx5.imag.2, %arrayidx5.imag.2
+ %add.2 = fadd fast float %mul9.2, %mul.2
+ %conv.2 = fpext float %add.2 to double
+ %add10.2 = fadd fast double %conv.2, %add10.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.cond.for.end13_crit_edge, label %for.cond1.preheader
+
+for.cond.for.end13_crit_edge: ; preds = %for.cond1.preheader
+ %add10.2.lcssa = phi double [ %add10.2, %for.cond1.preheader ]
+ %phitmp = fptrunc double %add10.2.lcssa to float
+ br label %for.end13
+
+for.end13: ; preds = %for.cond.for.end13_crit_edge, %entry
+ %sum.0.lcssa = phi float [ %phitmp, %for.cond.for.end13_crit_edge ], [ 0.000000e+00, %entry ]
+ store float %sum.0.lcssa, float* %r, align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg
new file mode 100644
index 00000000000..5d33887ff0a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/lit.local.cfg
@@ -0,0 +1,3 @@
+if 'PowerPC' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
new file mode 100644
index 00000000000..d3cdabd26f5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/pr30990.ll
@@ -0,0 +1,140 @@
+; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s
+
+target triple = "powerpc64-unknown-linux-gnu"
+
+define signext i32 @foo(i8* readonly %ptr, i32 signext %l) {
+entry:
+ %idx.ext = sext i32 %l to i64
+ %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
+ %cmp7 = icmp sgt i32 %l, 0
+ br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+ %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+ %0 = load i8, i8* %ptr.addr.08, align 1
+ %cmp1 = icmp slt i8 %0, -64
+ %cond = zext i1 %cmp1 to i32
+ %add = add nsw i32 %cond, %count.09
+ %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
+ %cmp = icmp ult i8* %incdec.ptr, %add.ptr
+ br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ %add.lcssa = phi i32 [ %add, %while.body ]
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+ ret i32 %count.0.lcssa
+
+; CHECK: load <4 x i8>
+; CHECK: icmp slt <4 x i8>
+}
+
+
+define signext i16 @foo2(i8* readonly %ptr, i32 signext %l) {
+entry:
+ %idx.ext = sext i32 %l to i64
+ %add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
+ %cmp7 = icmp sgt i32 %l, 0
+ br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %count.09 = phi i16 [ %add, %while.body ], [ 0, %while.body.preheader ]
+ %ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+ %0 = load i8, i8* %ptr.addr.08, align 1
+ %cmp1 = icmp slt i8 %0, -64
+ %cond = zext i1 %cmp1 to i16
+ %add = add nsw i16 %cond, %count.09
+ %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
+ %cmp = icmp ult i8* %incdec.ptr, %add.ptr
+ br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ %add.lcssa = phi i16 [ %add, %while.body ]
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %count.0.lcssa = phi i16 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+ ret i16 %count.0.lcssa
+
+; CHECK-LABEL: foo2
+; CHECK: load <8 x i8>
+; CHECK: icmp slt <8 x i8>
+}
+
+define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) {
+entry:
+ %idx.ext = sext i32 %l to i64
+ %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
+ %cmp7 = icmp sgt i32 %l, 0
+ br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+ %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+ %0 = load i16, i16* %ptr.addr.16, align 1
+ %cmp1 = icmp slt i16 %0, -64
+ %cond = zext i1 %cmp1 to i32
+ %add = add nsw i32 %cond, %count.09
+ %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
+ %cmp = icmp ult i16* %incdec.ptr, %add.ptr
+ br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ %add.lcssa = phi i32 [ %add, %while.body ]
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+ ret i32 %count.0.lcssa
+
+; CHECK-LABEL: foo3
+; CHECK: load <4 x i16>
+; CHECK: icmp slt <4 x i16>
+}
+
+define i64 @foo4(i16* readonly %ptr, i32 signext %l) {
+entry:
+ %idx.ext = sext i32 %l to i64
+ %add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
+ %cmp7 = icmp sgt i32 %l, 0
+ br i1 %cmp7, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %count.09 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
+ %ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
+ %0 = load i16, i16* %ptr.addr.16, align 1
+ %cmp1 = icmp slt i16 %0, -64
+ %cond = zext i1 %cmp1 to i64
+ %add = add nsw i64 %cond, %count.09
+ %incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
+ %cmp = icmp ult i16* %incdec.ptr, %add.ptr
+ br i1 %cmp, label %while.body, label %while.end.loopexit
+
+while.end.loopexit: ; preds = %while.body
+ %add.lcssa = phi i64 [ %add, %while.body ]
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %count.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
+ ret i64 %count.0.lcssa
+
+; CHECK-LABEL: foo4
+; CHECK: load <2 x i16>
+; CHECK: icmp slt <2 x i16>
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
new file mode 100644
index 00000000000..76864bc4629
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: vector.body:
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NOT: fadd
+; CHECK: middle.block
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @test(double* nocapture readonly %arr, i32 signext %len) #0 {
+entry:
+ %cmp4 = icmp sgt i32 %len, 0
+ br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %0 = add i32 %len, -1
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %redx.05 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %arr, i64 %indvars.iv
+ %1 = load double, double* %arrayidx, align 8
+ %add = fadd fast double %1, %redx.05
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %0
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %add.lcssa = phi double [ %add, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %redx.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
new file mode 100644
index 00000000000..f6f2609e8fb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/stride-vectorization.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+ br label %for.body
+
+; CHECK-LABEL: @foo
+; CHECK: <2 x double>
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %odd.idx = add nsw i64 %0, 1
+
+ %arrayidx = getelementptr inbounds double, double* %b, i64 %0
+ %arrayidx.odd = getelementptr inbounds double, double* %b, i64 %odd.idx
+
+ %1 = load double, double* %arrayidx, align 8
+ %2 = load double, double* %arrayidx.odd, align 8
+
+ %add = fadd double %1, %2
+ %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+ store double %add, double* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1600
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { nounwind "target-cpu"="pwr8" }
+
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
new file mode 100644
index 00000000000..8abc25ece35
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: nounwind
+define zeroext i32 @test() #0 {
+; CHECK-LABEL: @test
+; CHECK-NOT: x i32>
+
+entry:
+ %a = alloca [1600 x i32], align 4
+ %c = alloca [1600 x i32], align 4
+ %0 = bitcast [1600 x i32]* %a to i8*
+ call void @llvm.lifetime.start(i64 6400, i8* %0) #3
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ %1 = bitcast [1600 x i32]* %c to i8*
+ call void @llvm.lifetime.start(i64 6400, i8* %1) #3
+ %arraydecay = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 0
+ %arraydecay1 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 0
+ %call = call signext i32 @bar(i32* %arraydecay, i32* %arraydecay1) #3
+ br label %for.body6
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.body ]
+ %arrayidx = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 %indvars.iv25
+ %2 = trunc i64 %indvars.iv25 to i32
+ store i32 %2, i32* %arrayidx, align 4
+ %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+ %exitcond27 = icmp eq i64 %indvars.iv.next26, 1600
+ br i1 %exitcond27, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup5: ; preds = %for.body6
+ call void @llvm.lifetime.end(i64 6400, i8* nonnull %1) #3
+ call void @llvm.lifetime.end(i64 6400, i8* %0) #3
+ ret i32 %add
+
+for.body6: ; preds = %for.body6, %for.cond.cleanup
+ %indvars.iv = phi i64 [ 0, %for.cond.cleanup ], [ %indvars.iv.next, %for.body6 ]
+ %s.022 = phi i32 [ 0, %for.cond.cleanup ], [ %add, %for.body6 ]
+ %arrayidx8 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %arrayidx8, align 4
+ %add = add i32 %3, %s.022
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1600
+ br i1 %exitcond, label %for.cond.cleanup5, label %for.body6
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+declare signext i32 @bar(i32*, i32*) #2
+
+attributes #0 = { nounwind "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+attributes #3 = { nounwind }
+
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
new file mode 100644
index 00000000000..15aec0d3539
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -mcpu=pwr7 -mattr=+vsx -loop-vectorize -instcombine -S | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.GlobalData = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float], [5 x i32], [12 x i8], [32000 x float], [7 x i32], [4 x i8], [32000 x float], [11 x i32], [4 x i8], [32000 x float], [13 x i32], [12 x i8], [256 x [256 x float]], [17 x i32], [12 x i8], [256 x [256 x float]], [19 x i32], [4 x i8], [256 x [256 x float]], [23 x i32], [4 x i8], [256 x [256 x float]] }
+
+@global_data = external global %struct.GlobalData, align 16
+@ntimes = external hidden unnamed_addr global i32, align 4
+
+define signext i32 @s173() #0 {
+entry:
+ %0 = load i32, i32* @ntimes, align 4
+ %cmp21 = icmp sgt i32 %0, 0
+ br i1 %cmp21, label %for.cond1.preheader, label %for.end12
+
+for.cond1.preheader: ; preds = %for.end, %entry
+ %nl.022 = phi i32 [ %inc11, %for.end ], [ 0, %entry ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %indvars.iv
+ %1 = load float, float* %arrayidx, align 4
+ %arrayidx5 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %indvars.iv
+ %2 = load float, float* %arrayidx5, align 4
+ %add = fadd float %1, %2
+ %3 = add nsw i64 %indvars.iv, 16000
+ %arrayidx8 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %3
+ store float %add, float* %arrayidx8, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 16000
+ br i1 %exitcond, label %for.end, label %for.body3
+
+for.end: ; preds = %for.body3
+ %inc11 = add nsw i32 %nl.022, 1
+ %4 = load i32, i32* @ntimes, align 4
+ %mul = mul nsw i32 %4, 10
+ %cmp = icmp slt i32 %inc11, %mul
+ br i1 %cmp, label %for.cond1.preheader, label %for.end12
+
+for.end12: ; preds = %for.end, %entry
+ ret i32 0
+
+; CHECK-LABEL: @s173
+; CHECK: load <4 x float>, <4 x float>*
+; CHECK: add nsw i64 %index, 16000
+; CHECK: ret i32 0
+}
+
+attributes #0 = { nounwind }
+
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
new file mode 100644
index 00000000000..1f7a6d29c57
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -dce \
+; RUN: -instcombine -force-vector-width=2 < %s | FileCheck %s
+;
+; Test that the loop vectorizer does not generate vector addresses that must
+; then always be extracted.
+
+; Check that the addresses for a scalarized memory access are not extracted
+; from a vector register.
+define i32 @foo(i32* nocapture %A) {
+;CHECK-LABEL: @foo(
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %0 = shl nsw i64 %index, 2
+;CHECK: %1 = shl i64 %index, 2
+;CHECK: %2 = or i64 %1, 4
+;CHECK: %3 = getelementptr inbounds i32, i32* %A, i64 %0
+;CHECK: %4 = getelementptr inbounds i32, i32* %A, i64 %2
+;CHECK: store i32 4, i32* %3, align 4
+;CHECK: store i32 4, i32* %4, align 4
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+ store i32 4, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 undef
+}
+
+
+; Check that a load of an address is scalarized.
+define i32 @foo1(i32* nocapture noalias %A, i32** nocapture %PtrPtr) {
+;CHECK-LABEL: @foo1(
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %0 = or i64 %index, 1
+;CHECK: %1 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %index
+;CHECK: %2 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %0
+;CHECK: %3 = load i32*, i32** %1, align 8
+;CHECK: %4 = load i32*, i32** %2, align 8
+;CHECK: %5 = load i32, i32* %3, align 4
+;CHECK: %6 = load i32, i32* %4, align 4
+;CHECK: %7 = insertelement <2 x i32> undef, i32 %5, i32 0
+;CHECK: %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+;CHECK: %9 = getelementptr inbounds i32, i32* %A, i64 %index
+;CHECK: %10 = bitcast i32* %9 to <2 x i32>*
+;CHECK: store <2 x i32> %8, <2 x i32>* %10, align 4
+
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %ptr = getelementptr inbounds i32*, i32** %PtrPtr, i64 %indvars.iv
+ %el = load i32*, i32** %ptr
+ %v = load i32, i32* %el
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %v, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
new file mode 100644
index 00000000000..d2e59452033
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+
+; Check costs for branches inside a vectorized loop around predicated
+; blocks. Each such branch will be guarded with an extractelement from the
+; vector compare plus a test under mask instruction. This cost is modelled on
+; the extractelement of i1.
+
+define void @fun(i32* %arr, i64 %trip.count) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
+ %l = load i32, i32* %arrayidx, align 4
+ %cmp55 = icmp sgt i32 %l, 0
+ br i1 %cmp55, label %if.then, label %for.inc
+
+if.then:
+ %sub = sub nsw i32 0, %l
+ store i32 %sub, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc:
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %trip.count
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ ret void
+
+; CHECK: LV: Found an estimated cost of 5 for VF 2 For instruction: br i1 %cmp55, label %if.then, label %for.inc
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br label %for.inc
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond, label %for.end.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg
new file mode 100644
index 00000000000..2f3cf7d3f04
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'SystemZ' not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
new file mode 100644
index 00000000000..1925527eacf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-0.ll
@@ -0,0 +1,27 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a scalarized load does not get operand scalarization costs added.
+
+define void @fun(i64* %data, i64 %n, i64 %s, double* %Src) {
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %mul = mul nsw i64 %iv, %s
+ %gep = getelementptr inbounds double, double* %Src, i64 %mul
+ %bct = bitcast double* %gep to i64*
+ %ld = load i64, i64* %bct
+ %iv.next = add nuw nsw i64 %iv, 1
+ %cmp110.us = icmp slt i64 %iv.next, %n
+ br i1 %cmp110.us, label %for.body, label %for.end
+
+for.end:
+ ret void
+
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %mul = mul nsw i64 %iv, %s
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %ld = load i64, i64* %bct
+}
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
new file mode 100644
index 00000000000..fbf8b114542
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/load-scalarization-cost-1.ll
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN: -enable-interleaved-mem-accesses=false -disable-output < %s 2>&1 \
+; RUN: | FileCheck %s
+; REQUIRES: asserts
+;
+; Check that a scalarized load does not get a zero cost in a vectorized
+; loop. It can only be folded into the add operand in the scalar loop.
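+; Our reading of the cost below: at VF=4 the scalarized load becomes one
+; scalar load per lane, hence the cost of 4 in the CHECK line (rather than
+; the zero cost a folded load would get).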
+
+define i32 @fun(i64* %data, i64 %n, i64 %s, i32* %Src) {
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %acc = phi i32 [ 0, %entry ], [ %acc_next, %for.body ]
+ %gep = getelementptr inbounds i32, i32* %Src, i64 %iv
+ %ld = load i32, i32* %gep
+ %acc_next = add i32 %acc, %ld
+ %iv.next = add nuw nsw i64 %iv, 2
+ %cmp110.us = icmp slt i64 %iv.next, %n
+ br i1 %cmp110.us, label %for.body, label %for.end
+
+for.end:
+ ret i32 %acc_next
+
+; CHECK: Found an estimated cost of 4 for VF 4 For instruction: %ld = load i32, i32* %gep
+}
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
new file mode 100644
index 00000000000..9fdf22ecd92
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN: -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
+; RUN: FileCheck %s
+;
+; Check that a scalarized load/store does not get a cost for inserts/
+; extracts, since z13 supports element load/store.
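+; Concretely, at VF=4 the expected cost is just 4 scalar memory operations
+; per access (one per lane), with no insertelement/extractelement overhead
+; added on top; the cost-of-4 CHECK lines below reflect that.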
+
+define void @fun(i32* %data, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ %tmp2 = add i32 %tmp1, 1
+ store i32 %tmp2, i32* %tmp0, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+
+; CHECK: LV: Scalarizing: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp2, i32* %tmp0, align 4
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
new file mode 100644
index 00000000000..4c992cedd88
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-02.ll
@@ -0,0 +1,149 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: -debug-only=loop-vectorize,vectorutils -max-interleave-group-factor=64\
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that some cost estimations for interleave groups make sense.
+
+; This loop is loading four i16 values at indices [0, 1, 2, 3], with a stride
+; of 4. At VF=4, memory interleaving means loading 4 * 4 * 16 bits = 2 vector
+; registers. Each of the 4 vector values must then be constructed from the
+; two vector registers using one vperm each, which gives a cost of 2 + 4 = 6.
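+; A worked sketch of that arithmetic (assuming 128-bit vector registers):
+;   bits loaded per vectorized step = VF * stride * element size
+;                                   = 4 * 4 * 16 = 256 bits = 2 registers
+;   cost = 2 vector loads + 4 vperms = 6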
+;
+; CHECK: LV: Checking a loop in "fun0"
+; CHECK: LV: Found an estimated cost of 6 for VF 4 For instruction: %ld0 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld1 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld2 = load i16
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %ld3 = load i16
+define void @fun0(i16 *%ptr, i16 *%dst) {
+entry:
+ br label %for.body
+
+for.body:
+ %ivptr = phi i16* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+ %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %inc = add i64 %iv, 4
+ %ptr0 = getelementptr inbounds i16, i16* %ivptr, i64 0
+ %ld0 = load i16, i16* %ptr0
+ %ptr1 = getelementptr inbounds i16, i16* %ivptr, i64 1
+ %ld1 = load i16, i16* %ptr1
+ %ptr2 = getelementptr inbounds i16, i16* %ivptr, i64 2
+ %ld2 = load i16, i16* %ptr2
+ %ptr3 = getelementptr inbounds i16, i16* %ivptr, i64 3
+ %ld3 = load i16, i16* %ptr3
+ %a1 = add i16 %ld0, %ld1
+ %a2 = add i16 %a1, %ld2
+ %a3 = add i16 %a2, %ld3
+ %dstptr = getelementptr inbounds i16, i16* %dst, i64 %iv
+ store i16 %a3, i16* %dstptr
+ %ptr.next = getelementptr inbounds i16, i16* %ivptr, i64 4
+ %cmp = icmp eq i64 %inc, 100
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; This loop loads one i8 value with a stride of 3. At VF=16, this means loading
+; 3 vector registers, and then constructing the vector value with two vperms,
+; which gives a cost of 5.
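+; Worked out under the same 128-bit register assumption: 16 lanes with a
+; byte stride of 3 span 16 * 3 = 48 bytes = 3 registers, and combining 3
+; registers into one vector value takes 2 vperms, so cost = 3 + 2 = 5.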
+;
+; CHECK: LV: Checking a loop in "fun1"
+; CHECK: LV: Found an estimated cost of 5 for VF 16 For instruction: %ld0 = load i8
+define void @fun1(i8 *%ptr, i8 *%dst) {
+entry:
+ br label %for.body
+
+for.body:
+ %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+ %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %inc = add i64 %iv, 4
+ %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+ %ld0 = load i8, i8* %ptr0
+ %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+ store i8 %ld0, i8* %dstptr
+ %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 3
+ %cmp = icmp eq i64 %inc, 100
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; This loop is loading 4 i8 values at indices [0, 1, 2, 3], with a stride of
+; 32. At VF=2, this means loading 2 vector registers, and using 4 vperms to
+; produce the vector values, which gives a cost of 6.
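+; Worked out: the 4 bytes each iteration uses sit inside a single 16-byte
+; register, and a stride of 32 keeps the two unrolled iterations in 2
+; distinct registers, so cost = 2 vector loads + 4 vperms = 6.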
+;
+; CHECK: LV: Checking a loop in "fun2"
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %ld0 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld1 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld2 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld3 = load i8
+define void @fun2(i8 *%ptr, i8 *%dst) {
+entry:
+ br label %for.body
+
+for.body:
+ %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+ %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %inc = add i64 %iv, 4
+ %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+ %ld0 = load i8, i8* %ptr0
+ %ptr1 = getelementptr inbounds i8, i8* %ivptr, i64 1
+ %ld1 = load i8, i8* %ptr1
+ %ptr2 = getelementptr inbounds i8, i8* %ivptr, i64 2
+ %ld2 = load i8, i8* %ptr2
+ %ptr3 = getelementptr inbounds i8, i8* %ivptr, i64 3
+ %ld3 = load i8, i8* %ptr3
+ %a1 = add i8 %ld0, %ld1
+ %a2 = add i8 %a1, %ld2
+ %a3 = add i8 %a2, %ld3
+ %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+ store i8 %a3, i8* %dstptr
+ %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 32
+ %cmp = icmp eq i64 %inc, 100
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; This loop is loading 4 i8 values at indices [0, 1, 2, 3], with a stride of
+; 30. At VF=2, this means loading 3 vector registers, and using 4 vperms to
+; produce the vector values, which gives a cost of 7. This is the same loop
+; as in fun2, except that the stride makes the second iteration's values
+; overlap a vector register boundary.
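+; Worked out: bytes [0,3] sit in the first 16-byte register, but with a
+; stride of 30 bytes [30,33] straddle the boundary between the second and
+; third registers, so cost = 3 vector loads + 4 vperms = 7.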
+;
+; CHECK: LV: Checking a loop in "fun3"
+; CHECK: LV: Found an estimated cost of 7 for VF 2 For instruction: %ld0 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld1 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld2 = load i8
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %ld3 = load i8
+define void @fun3(i8 *%ptr, i8 *%dst) {
+entry:
+ br label %for.body
+
+for.body:
+ %ivptr = phi i8* [ %ptr.next, %for.body ], [ %ptr, %entry ]
+ %iv = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %inc = add i64 %iv, 4
+ %ptr0 = getelementptr inbounds i8, i8* %ivptr, i64 0
+ %ld0 = load i8, i8* %ptr0
+ %ptr1 = getelementptr inbounds i8, i8* %ivptr, i64 1
+ %ld1 = load i8, i8* %ptr1
+ %ptr2 = getelementptr inbounds i8, i8* %ivptr, i64 2
+ %ld2 = load i8, i8* %ptr2
+ %ptr3 = getelementptr inbounds i8, i8* %ivptr, i64 3
+ %ld3 = load i8, i8* %ptr3
+ %a1 = add i8 %ld0, %ld1
+ %a2 = add i8 %a1, %ld2
+ %a3 = add i8 %a2, %ld3
+ %dstptr = getelementptr inbounds i8, i8* %dst, i64 %iv
+ store i8 %a3, i8* %dstptr
+ %ptr.next = getelementptr inbounds i8, i8* %ivptr, i64 30
+ %cmp = icmp eq i64 %inc, 100
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
new file mode 100644
index 00000000000..4e04a3423ed
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
@@ -0,0 +1,70 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN: -force-vector-width=4 -debug-only=loop-vectorize,vectorutils \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that the loop vectorizer performs memory interleaving with accurate
+; cost estimations.
+
+
+; Simple case where just the load is interleaved, because the store group
+; would have gaps.
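+; With VF=4 and a stride of 2, the load group spans 8 x i32 = 32 bytes =
+; two 128-bit registers, and one vperm then gathers the four used elements;
+; that is the (vl; vl; vperm) = 3 annotated on the CHECK lines below.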
+define void @fun0(i32* %data, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ %tmp2 = add i32 %tmp1, 1
+ store i32 %tmp2, i32* %tmp0, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+
+; CHECK: LV: Creating an interleave group with: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4
+; (vl; vl; vperm)
+}
+
+; Interleaving of both loads and stores.
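+; The loop swaps adjacent elements, so the loads and the stores each form a
+; full interleave group. Per the annotations below, the load group costs
+; (vl; vl; vperm, vpkg) = 4 and the store group (vmrlf; vmrhf; vst; vst) = 4,
+; with each group's cost attributed to a single member and 0 to the rest.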
+define void @fun1(i32* %data, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ %i_1 = add i64 %i, 1
+ %tmp2 = getelementptr inbounds i32, i32* %data, i64 %i_1
+ %tmp3 = load i32, i32* %tmp2, align 4
+ store i32 %tmp1, i32* %tmp2, align 4
+ store i32 %tmp3, i32* %tmp0, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+
+; CHECK: LV: Creating an interleave group with: store i32 %tmp3, i32* %tmp0, align 4
+; CHECK: LV: Inserted: store i32 %tmp1, i32* %tmp2, align 4
+; CHECK: into the interleave group with store i32 %tmp3, i32* %tmp0, align 4
+; CHECK: LV: Creating an interleave group with: %tmp3 = load i32, i32* %tmp2, align 4
+; CHECK: LV: Inserted: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: into the interleave group with %tmp3 = load i32, i32* %tmp2, align 4
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp2, align 4
+; (vl; vl; vperm, vpkg)
+
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp1, i32* %tmp2, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i32 %tmp3, i32* %tmp0, align 4
+; (vmrlf; vmrhf; vst; vst)
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr38110.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr38110.ll
new file mode 100644
index 00000000000..6c8fef9fc91
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr38110.ll
@@ -0,0 +1,50 @@
+; RUN: opt -passes='loop-vectorize' -mcpu=z13 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Forcing VF=2 to trigger vector code gen
+;
+; This is a test case to exercise more cases in truncateToMinimalBitWidths().
+; The test passes if vector code is generated without hitting
+; llvm_unreachable().
+;
+; We perform a minimal check on the output to ensure the loop is actually
+; vectorized.
+;
+; CHECK: vector.body
+
+target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64"
+target triple = "s390x-ibm-linux"
+
+define void @test(i32 zeroext %width, i8* nocapture %row, i16 zeroext %src, i16* nocapture readonly %dst) {
+entry:
+ %cmp10 = icmp eq i32 %width, 0
+ br i1 %cmp10, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %conv1 = zext i16 %src to i32
+ br label %for.body
+
+for.body: ; preds = %for.inc, %for.body.lr.ph
+ %i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+ %sp.011 = phi i8* [ %row, %for.body.lr.ph ], [ %incdec.ptr, %for.inc ]
+ %0 = load i8, i8* %sp.011, align 1
+ %conv = zext i8 %0 to i32
+ %cmp2 = icmp eq i32 %conv, %conv1
+ br i1 %cmp2, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %1 = load i16, i16* %dst, align 2
+ %conv4 = trunc i16 %1 to i8
+ store i8 %conv4, i8* %sp.011, align 1
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %inc = add nuw i32 %i.012, 1
+ %incdec.ptr = getelementptr inbounds i8, i8* %sp.011, i64 1
+ %exitcond = icmp eq i32 %inc, %width
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.inc
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
new file mode 100644
index 00000000000..60dd07698a3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; We want to make sure that we don't even try to vectorize loops again.
+; The vectorizer used to mark only the un-vectorized loop as already
+; vectorized, and would thus try to vectorize the vectorized loop again.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external global [255 x i32]
+
+; Function Attrs: nounwind readonly uwtable
+define i32 @vect() {
+; CHECK: LV: Checking a loop in "vect"
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+; We need to make sure we did vectorize the loop
+; CHECK: LV: Found a loop: for.body
+; CHECK: LV: We can vectorize this loop!
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %red.05
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 255
+ br i1 %exitcond, label %for.end, label %for.body
+
+; If it did, we have two loops:
+; CHECK: vector.body:
+; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]]
+; CHECK: for.body:
+; CHECK: br {{.*}} label %for.body, !llvm.loop [[scalar:![0-9]+]]
+
+for.end: ; preds = %for.body
+ ret i32 %add
+}
+
+; Now, we check for the Hint metadata
+; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]]}
+; CHECK: [[width]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[scalar]] = distinct !{[[scalar]], [[runtime_unroll:![0-9]+]], [[width]]}
+; CHECK: [[runtime_unroll]] = !{!"llvm.loop.unroll.runtime.disable"}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/assume.ll b/llvm/test/Transforms/LoopVectorize/X86/assume.ll
new file mode 100644
index 00000000000..4fd378d1a0a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/assume.ll
@@ -0,0 +1,100 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
+entry:
+ br label %for.body
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: for.body:
+; CHECK: ret void
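+;
+; A rough C equivalent of the loop below (hypothetical, for illustration):
+;
+;   for (long i = 0; i < 1600; ++i) {
+;     __builtin_assume(b[i] > 100.0f);
+;     a[i] = b[i] + 1.0f;
+;   }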
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 1.000000e+02
+ tail call void @llvm.assume(i1 %cmp1)
+ %add = fadd float %0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ store float %add, float* %arrayidx5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv, 1599
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
+%struct.data = type { float*, float* }
+
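+; A rough C equivalent of @test2 below (hypothetical, for illustration). The
+; ptrtoint/and/icmp pattern feeding each assume is what clang emits for
+; __builtin_assume_aligned(p, 32):
+;
+;   struct data { float *a, *b; };
+;   void test2(const struct data *d) {
+;     float *b = __builtin_assume_aligned(d->b, 32);
+;     float *a = __builtin_assume_aligned(d->a, 32);
+;     for (long i = 0; i < 1600; ++i)
+;       a[i] = b[i] + 1.0f;
+;   }
+;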
+; Function Attrs: nounwind uwtable
+define void @test2(%struct.data* nocapture readonly %d) #0 {
+entry:
+ %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
+ %0 = load float*, float** %b, align 8
+ %ptrint = ptrtoint float* %0 to i64
+ %maskedptr = and i64 %ptrint, 31
+ %maskcond = icmp eq i64 %maskedptr, 0
+ %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0
+ %1 = load float*, float** %a, align 8
+ %ptrint2 = ptrtoint float* %1 to i64
+ %maskedptr3 = and i64 %ptrint2, 31
+ %maskcond4 = icmp eq i64 %maskedptr3, 0
+ br label %for.body
+
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: @llvm.assume
+; CHECK: for.body:
+; CHECK: ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ tail call void @llvm.assume(i1 %maskcond)
+ %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv
+ %2 = load float, float* %arrayidx, align 4
+ %add = fadd float %2, 1.000000e+00
+ tail call void @llvm.assume(i1 %maskcond4)
+ %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv
+ store float %add, float* %arrayidx5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv, 1599
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
new file mode 100644
index 00000000000..d384a8162ba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-vectorize -mattr=avx,+slow-unaligned-mem-32 -S | FileCheck %s --check-prefix=SLOWMEM32 --check-prefix=CHECK
+; RUN: opt < %s -loop-vectorize -mattr=avx,-slow-unaligned-mem-32 -S | FileCheck %s --check-prefix=FASTMEM32 --check-prefix=CHECK
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: @read_mod_write_single_ptr(
+; CHECK: load <8 x float>
+; CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %3 = load float, float* %2, align 4
+ %4 = fmul float %3, 3.000000e+00
+ store float %4, float* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+
+; CHECK-LABEL: @read_mod_i64(
+; SLOWMEM32: load <2 x i64>
+; FASTMEM32: load <4 x i64>
+; CHECK: ret i32
+define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
+ %3 = load i64, i64* %2, align 4
+ %4 = add i64 %3, 3
+ store i64 %4, i64* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
new file mode 100644
index 00000000000..0917e007224
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -0,0 +1,112 @@
+; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s
+; RUN: opt -mattr=+avx512vl,+prefer-256-bit --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s --check-prefix=CHECK-PREFER-AVX256
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Verify that we generate 512-bit wide vectors for a basic integer memset
+; loop.
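+;
+; A rough C equivalent of @f below (hypothetical, for illustration):
+;
+;   void f(int *a, int n) {
+;     for (int i = 0; i < n; ++i)
+;       a[i] = n;
+;   }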
+
+; CHECK-LABEL: f:
+; CHECK: vmovdqu64 %zmm{{.}},
+; CHECK-NOT: %ymm
+
+; Verify that we don't generate 512-bit wide vectors when the subtarget
+; feature says not to.
+
+; CHECK-PREFER-AVX256-LABEL: f:
+; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}},
+; CHECK-PREFER-AVX256-NOT: %zmm
+
+define void @f(i32* %a, i32 %n) {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %n, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Verify that the "prefer-vector-width=256" attribute prevents the use of 512-bit
+; vectors
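+; (In C, this attribute typically results from building the function with
+; e.g. clang's -mprefer-vector-width=256 flag; here it is written directly
+; in the IR.)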
+
+; CHECK-LABEL: g:
+; CHECK: vmovdqu %ymm{{.}},
+; CHECK-NOT: %zmm
+
+; CHECK-PREFER-AVX256-LABEL: g:
+; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}},
+; CHECK-PREFER-AVX256-NOT: %zmm
+
+define void @g(i32* %a, i32 %n) "prefer-vector-width"="256" {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %n, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; Verify that the "prefer-vector-width=512" attribute override the subtarget
+; vectors
+
+; CHECK-LABEL: h:
+; CHECK: vmovdqu64 %zmm{{.}},
+; CHECK-NOT: %ymm
+
+; CHECK-PREFER-AVX256-LABEL: h:
+; CHECK-PREFER-AVX256: vmovdqu64 %zmm{{.}},
+; CHECK-PREFER-AVX256-NOT: %ymm
+
+define void @h(i32* %a, i32 %n) "prefer-vector-width"="512" {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %n, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll
new file mode 100644
index 00000000000..f944d1f97da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll
@@ -0,0 +1,108 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR34965/D39346
+
+; LV retains the original scalar loop intact as remainder loop. However,
+; after this transformation, analysis information concerning the remainder
+; loop may differ from the original scalar loop. This test is an example of
+; that behaviour, where values inside the remainder loop which SCEV could
+; originally analyze now require flow-sensitive analysis currently not
+; supported in SCEV. In particular, during LV code generation, after turning
+; the original scalar loop into the remainder loop, LV expected
+; Legal->isConsecutivePtr() to be consistent and return the same output as
+; during legal/cost model phases (original scalar loop). Unfortunately, that
+; condition was not satisfied because of the aforementioned SCEV limitation.
+; After D39346, LV code generation doesn't rely on Legal->isConsecutivePtr(),
+; i.e., SCEV. This test verifies that LV is able to handle the described cases.
+;
+; TODO: The SCEV limitation described before may affect plans to further
+; optimize the remainder loop of this particular test case. One tentative
+; solution is to detect the problematic IVs in LV (%7 and %8) and perform an
+; in-place IV optimization by replacing:
+; %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
+; with
+; %8 = sub i32 %7, 1.
+
+
+; Verify that the store is vectorized as a stride-1 memory access.
+
+; CHECK-LABEL: @test_01(
+; CHECK-NOT: vector.body:
+
+; This test was originally vectorized, but now SCEV is smart enough to prove
+; that its trip count is 1, so it gets ignored by the vectorizer.
+; Function Attrs: uwtable
+define void @test_01() {
+ br label %.outer
+
+; <label>:1: ; preds = %2
+ ret void
+
+; <label>:2: ; preds = %._crit_edge.loopexit
+ %3 = add nsw i32 %.ph, -2
+ br i1 undef, label %1, label %.outer
+
+.outer: ; preds = %2, %0
+ %.ph = phi i32 [ %3, %2 ], [ 336, %0 ]
+ %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ]
+ %4 = and i32 %.ph, 30
+ %5 = add i32 %.ph2, 1
+ br label %6
+
+; <label>:6: ; preds = %6, %.outer
+ %7 = phi i32 [ %5, %.outer ], [ %13, %6 ]
+ %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
+ %9 = add i32 %8, 2
+ %10 = zext i32 %9 to i64
+ %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10
+ %12 = ashr i32 undef, %4
+ store i32 %12, i32 addrspace(1)* %11, align 4
+ %13 = add i32 %7, 1
+ %14 = icmp sgt i32 %13, 61
+ br i1 %14, label %._crit_edge.loopexit, label %6
+
+._crit_edge.loopexit: ; preds = %._crit_edge.loopexit, %6
+ br i1 undef, label %2, label %._crit_edge.loopexit
+}
+
+; After the trip count is increased, the test gets vectorized.
+; CHECK-LABEL: @test_02(
+; CHECK: vector.body:
+; CHECK: store <4 x i32>
+
+; Function Attrs: uwtable
+define void @test_02() {
+ br label %.outer
+
+; <label>:1: ; preds = %2
+ ret void
+
+; <label>:2: ; preds = %._crit_edge.loopexit
+ %3 = add nsw i32 %.ph, -2
+ br i1 undef, label %1, label %.outer
+
+.outer: ; preds = %2, %0
+ %.ph = phi i32 [ %3, %2 ], [ 336, %0 ]
+ %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ]
+ %4 = and i32 %.ph, 30
+ %5 = add i32 %.ph2, 1
+ br label %6
+
+; <label>:6: ; preds = %6, %.outer
+ %7 = phi i32 [ %5, %.outer ], [ %13, %6 ]
+ %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ]
+ %9 = add i32 %8, 2
+ %10 = zext i32 %9 to i64
+ %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10
+ %12 = ashr i32 undef, %4
+ store i32 %12, i32 addrspace(1)* %11, align 4
+ %13 = add i32 %7, 1
+ %14 = icmp sgt i32 %13, 610
+ br i1 %14, label %._crit_edge.loopexit, label %6
+
+._crit_edge.loopexit: ; preds = %._crit_edge.loopexit, %6
+ br i1 undef, label %2, label %._crit_edge.loopexit
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
new file mode 100644
index 00000000000..e18159f2462
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -0,0 +1,67 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: PR31671
+;
+; Check a pointer for which one use is consecutive-like and another use is
+; non-consecutive-like. In the test case below, %tmp3 is the
+; pointer operand of an interleaved load, making it consecutive-like. However,
+; it is also the pointer operand of a non-interleaved store that will become a
+; scatter operation. %tmp3 (and the induction variable) should not be marked
+; uniform-after-vectorization.
+;
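+; A rough C equivalent of @PR31671 below (hypothetical, for illustration;
+; the field names are invented):
+;
+;   struct data { float a[32000]; int p1[3]; char p2[4]; float b[32000]; };
+;   void PR31671(float x, struct data *d) {
+;     for (long i = 0; i < 32000; i += 5)
+;       d->a[i] += x * d->b[i];
+;   }
+;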
+; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
+; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
+; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>*
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0
+; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+%data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
+
+define void @PR31671(float %x, %data* %d) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
+ %tmp1 = load float, float* %tmp0, align 4
+ %tmp2 = fmul float %x, %tmp1
+ %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
+ %tmp4 = load float, float* %tmp3, align 4
+ %tmp5 = fadd float %tmp4, %tmp2
+ store float %tmp5, float* %tmp3, align 4
+ %i.next = add nuw nsw i64 %i, 5
+ %cond = icmp slt i64 %i.next, 32000
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="knl" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
new file mode 100644
index 00000000000..a73d91ba30b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mtriple=x86_64-- -o - %s | FileCheck %s
+
+; Test case that verifies we don't get a faulty bitcast that casts between
+; different sizes.
+
+%rec8 = type { i16 }
+
+@a = global [1 x %rec8] zeroinitializer
+@b = global [2 x i16*] zeroinitializer
+
+
+define void @f1() {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT: bb1:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], <i16 0, i16 1>
+; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
+; CHECK-NEXT: store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK: middle.block:
+
+bb1:
+ br label %bb2
+
+bb2:
+ %c.1.0 = phi i16 [ 0, %bb1 ], [ %_tmp9, %bb2 ]
+ %_tmp1 = zext i16 0 to i64
+ %_tmp2 = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 %_tmp1
+ %_tmp4 = bitcast %rec8* %_tmp2 to i16*
+ %_tmp6 = sext i16 %c.1.0 to i64
+ %_tmp7 = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 %_tmp6
+ store i16* %_tmp4, i16** %_tmp7
+ %_tmp9 = add nsw i16 %c.1.0, 1
+ %_tmp11 = icmp slt i16 %_tmp9, 2
+ br i1 %_tmp11, label %bb2, label %bb3
+
+bb3:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
new file mode 100644
index 00000000000..d75b1d940ae
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
@@ -0,0 +1,30 @@
+; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -loop-vectorize -dce -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@B = common global [1024 x i32] zeroinitializer, align 16
+@A = common global [1024 x i32] zeroinitializer, align 16
+
+; We used to not vectorize this loop because the shift was deemed too
+; expensive. Now that we differentiate shift cost based on the operand value
+; kind, we will vectorize this loop.
+; CHECK: ashr <4 x i32>
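+;
+; A rough C equivalent (hypothetical, for illustration):
+;
+;   for (long i = 0; i < 1024; ++i)
+;     A[i] = B[i] >> 3;   /* arithmetic shift by a constant amount */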
+define void @f() {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %shl = ashr i32 %0, 3
+ %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ store i32 %shl, i32* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
new file mode 100644
index 00000000000..eb2a2a56fae
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK-LABEL: @conversion_cost1(
+;CHECK: store <32 x i8>
+;CHECK: ret
+define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 3
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
+ %2 = trunc i64 %indvars.iv to i8
+ %3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+ store i8 %2, i8* %3, align 1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+;CHECK-LABEL: @conversion_cost2(
+;CHECK: <2 x float>
+;CHECK: ret
+define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 9
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+ %add = add nsw i64 %indvars.iv, 3
+ %tofp = sitofp i64 %add to float
+ %gep = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ store float %tofp, float* %gep, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
new file mode 100644
index 00000000000..0ee2660ed31
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -0,0 +1,82 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@c = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@d = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; The program below gathers and scatters data. We had better not vectorize it.
+;CHECK-LABEL: @cost_model_1(
+;CHECK-NOT: <2 x i32>
+;CHECK-NOT: <4 x i32>
+;CHECK-NOT: <8 x i32>
+;CHECK: ret void
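+;
+; A rough C equivalent (hypothetical, for illustration); the load from @b is
+; a gather and the store to @a is a scatter, both through loaded indices:
+;
+;   for (long i = 0; i < 256; ++i)
+;     a[d[i]] = b[c[2 * i]];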
+define void @cost_model_1() nounwind uwtable noinline ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 8
+ %idxprom1 = sext i32 %1 to i64
+ %arrayidx2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %idxprom1
+ %2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @d, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %arrayidx4, align 4
+ %idxprom5 = sext i32 %3 to i64
+ %arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %idxprom5
+ store i32 %2, i32* %arrayidx6, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; This function uses a stride that is generally too big to benefit from vectorization without
+; really good support for a gather load. We were not computing an accurate cost for the
+; vectorization and subsequent scalarization of the pointer induction variables.
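+;
+; A rough C equivalent (hypothetical, for illustration):
+;
+;   float s = 0.0f;
+;   for (long i = 0; i < n; i += 32)
+;     s += a[i] + b[i];
+;   return s;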
+
+define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
+; CHECK-LABEL: @PR27826(
+; CHECK-NOT: <4 x float>
+; CHECK-NOT: <8 x float>
+; CHECK: ret float %s.0.lcssa
+
+entry:
+ %cmp = icmp sgt i32 %n, 0
+ br i1 %cmp, label %preheader, label %for.end
+
+preheader:
+ %t0 = sext i32 %n to i64
+ br label %for
+
+for:
+ %indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for ]
+ %s.02 = phi float [ 0.0, %preheader ], [ %add4, %for ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %t1 = load float, float* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %t2 = load float, float* %arrayidx3, align 4
+ %add = fadd fast float %t1, %s.02
+ %add4 = fadd fast float %add, %t2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32
+ %cmp1 = icmp slt i64 %indvars.iv.next, %t0
+ br i1 %cmp1, label %for, label %loopexit
+
+loopexit:
+ %add4.lcssa = phi float [ %add4, %for ]
+ br label %for.end
+
+for.end:
+ %s.0.lcssa = phi float [ 0.0, %entry ], [ %add4.lcssa, %loopexit ]
+ ret float %s.0.lcssa
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
new file mode 100644
index 00000000000..af4a48d3a22
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -0,0 +1,149 @@
+; RUN: opt < %s -O3 -simplifycfg -keep-loops=false -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s
+
+; This test checks auto-vectorization with an FP induction variable.
+; The FP operation is not "fast" and requires a "fast-math" function attribute.
+
+;void fp_iv_loop1(float * __restrict__ A, int N) {
+; float x = 1.0;
+; for (int i=0; i < N; ++i) {
+; A[i] = x;
+; x += 0.5;
+; }
+;}
+
+
+; AUTO_VEC-LABEL: @fp_iv_loop1(
+; AUTO_VEC: vector.body
+; AUTO_VEC: store <8 x float>
+
+define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 {
+entry:
+ %cmp4 = icmp sgt i32 %N, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %x.06, float* %arrayidx, align 4
+ %conv1 = fadd float %x.06, 5.000000e-01
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; The same as the previous test, but the FP operation is not fast and the
+; function attribute differs. Vectorization should be rejected.
+;void fp_iv_loop2(float * __restrict__ A, int N) {
+; float x = 1.0;
+; for (int i=0; i < N; ++i) {
+; A[i] = x;
+; x += 0.5;
+; }
+;}
+
+; AUTO_VEC-LABEL: @fp_iv_loop2(
+; AUTO_VEC-NOT: vector.body
+; AUTO_VEC-NOT: store <{{.*}} x float>
+
+define void @fp_iv_loop2(float* noalias nocapture %A, i32 %N) #1 {
+entry:
+ %cmp4 = icmp sgt i32 %N, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %x.06, float* %arrayidx, align 4
+ %conv1 = fadd float %x.06, 5.000000e-01
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; AUTO_VEC-LABEL: @external_use_with_fast_math(
+; AUTO_VEC-NEXT: entry:
+; AUTO_VEC-NEXT: [[TMP0:%.*]] = icmp sgt i64 %n, 1
+; AUTO_VEC-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 %n, i64 1
+; AUTO_VEC: br i1 {{.*}}, label %for.body, label %vector.ph
+; AUTO_VEC: vector.ph:
+; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; AUTO_VEC: br label %vector.body
+; AUTO_VEC: middle.block:
+; AUTO_VEC: [[TMP11:%.*]] = add nsw i64 [[N_VEC]], -1
+; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP11]] to double
+; AUTO_VEC-NEXT: [[TMP12:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00
+; AUTO_VEC-NEXT: br i1 {{.*}}, label %for.end, label %for.body
+; AUTO_VEC: for.end:
+; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP12]], %middle.block ], [ %j, %for.body ]
+; AUTO_VEC-NEXT: ret double [[J_LCSSA]]
+;
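+; A rough C equivalent (hypothetical, for illustration): j is an FP induction
+; variable whose last-iteration value escapes the loop; with fast-math the
+; vectorizer can recompute that value as 3.0 * (n - 1) in the middle block:
+;
+;   double j = 0.0;
+;   for (long i = 0; i < n; ++i) {
+;     a[i] = j;              /* j == 3.0 * i on iteration i */
+;     j += 3.0;
+;   }
+;   /* the value used after the loop is the one stored last: 3.0 * (n - 1) */
+;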
+define double @external_use_with_fast_math(double* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [%i.next, %for.body]
+ %j = phi double [ 0.0, %entry ], [ %j.next, %for.body ]
+ %tmp0 = getelementptr double, double* %a, i64 %i
+ store double %j, double* %tmp0
+ %i.next = add i64 %i, 1
+ %j.next = fadd fast double %j, 3.0
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp1 = phi double [ %j, %for.body ]
+ ret double %tmp1
+}
+
+; AUTO_VEC-LABEL: @external_use_without_fast_math(
+; AUTO_VEC: for.body:
+; AUTO_VEC: [[J:%.*]] = phi double [ 0.000000e+00, %entry ], [ [[J_NEXT:%.*]], %for.body ]
+; AUTO_VEC: [[J_NEXT]] = fadd double [[J]], 3.000000e+00
+; AUTO_VEC: br i1 {{.*}}, label %for.body, label %for.end
+; AUTO_VEC: for.end:
+; AUTO_VEC-NEXT: ret double [[J]]
+;
+define double @external_use_without_fast_math(double* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [%i.next, %for.body]
+ %j = phi double [ 0.0, %entry ], [ %j.next, %for.body ]
+ %tmp0 = getelementptr double, double* %a, i64 %i
+ store double %j, double* %tmp0
+ %i.next = add i64 %i, 1
+ %j.next = fadd double %j, 3.0
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp1 = phi double [ %j, %for.body ]
+ ret double %tmp1
+}
+
+attributes #0 = { "no-nans-fp-math"="true" }
+attributes #1 = { "no-nans-fp-math"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll b/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll
new file mode 100644
index 00000000000..07b98b4fb00
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll
@@ -0,0 +1,42 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @Test(i32* nocapture %res, i32* nocapture readnone %c, i32* nocapture readonly %d, i32* nocapture readonly %p) #0 {
+entry:
+ br label %for.body
+
+; CHECK-LABEL: @Test
+; CHECK: <4 x i32>
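+;
+; A rough C equivalent (hypothetical, for illustration); the
+; llvm.loop.parallel_accesses metadata below is what e.g.
+; "#pragma clang loop vectorize(assume_safety)" produces:
+;
+;   for (long i = 0; i < 16; ++i)
+;     res[i] = (p[i] == 0) ? res[i] : res[i] + d[i];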
+
+for.body: ; preds = %cond.end, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %cond.end ]
+ %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !1
+ %cmp1 = icmp eq i32 %0, 0
+ %arrayidx3 = getelementptr inbounds i32, i32* %res, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx3, align 4, !llvm.access.group !1
+ br i1 %cmp1, label %cond.end, label %cond.false
+
+cond.false: ; preds = %for.body
+ %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+ %2 = load i32, i32* %arrayidx7, align 4, !llvm.access.group !1
+ %add = add nsw i32 %2, %1
+ br label %cond.end
+
+cond.end: ; preds = %for.body, %cond.false
+ %cond = phi i32 [ %add, %cond.false ], [ %1, %for.body ]
+ store i32 %cond, i32* %arrayidx3, align 4, !llvm.access.group !1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 16
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end: ; preds = %cond.end
+ ret void
+}
+
+attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
+
+!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}}
+!1 = distinct !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
new file mode 100644
index 00000000000..4a56d6b5ebf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+@float_array = common global [10000 x float] zeroinitializer, align 16
+@unsigned_array = common global [10000 x i32] zeroinitializer, align 16
+
+; If we need to scalarize the fptoui and then use inserts to build up the
+; vector again, then there is certainly no value in going 256-bit wide.
+; CHECK-NOT: vinserti128
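+;
+; A rough C equivalent of @convert below (hypothetical, for illustration):
+;
+;   for (long i = 0; i < N; ++i)
+;     unsigned_array[i] = (unsigned)float_array[i];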
+
+define void @convert(i32 %N) {
+entry:
+ %0 = icmp eq i32 %N, 0
+ br i1 %0, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds [10000 x float], [10000 x float]* @float_array, i64 0, i64 %indvars.iv
+ %1 = load float, float* %arrayidx, align 4
+ %conv = fptoui float %1 to i32
+ %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
new file mode 100644
index 00000000000..c066afcfa63
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+@n = global i32 10000, align 4
+@double_array = common global [10000 x double] zeroinitializer, align 16
+@unsigned_array = common global [10000 x i32] zeroinitializer, align 16
+
+; If we need to scalarize the fptoui and then use inserts to build up the
+; vector again, then there is certainly no value in going 256-bit wide.
+; CHECK-NOT: vpinsrd
+
+define void @convert() {
+entry:
+ %0 = load i32, i32* @n, align 4
+ %cmp4 = icmp eq i32 %0, 0
+ br i1 %cmp4, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds [10000 x double], [10000 x double]* @double_array, i64 0, i64 %indvars.iv
+ %1 = load double, double* %arrayidx, align 8
+ %conv = fptoui double %1 to i32
+ %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %2 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp ult i32 %2, %0
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
new file mode 100644
index 00000000000..b3a0710545d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+
+; CHECK: cost of 7 for VF 8 For instruction: %conv = fptosi float %tmp to i8
+define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind {
+entry:
+ br label %for.body
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %tmp = load float, float* %arrayidx, align 4
+ %conv = fptosi float %tmp to i8
+ %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+ store i8 %conv, i8* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
new file mode 100644
index 00000000000..88f15e7e148
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc18.0.0"
+
+define void @test1() #0 personality i32 (...)* @__CxxFrameHandler3 {
+entry:
+ invoke void @_CxxThrowException(i8* null, i8* null)
+ to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %entry
+ %0 = catchswitch within none [label %catch] unwind to caller
+
+catch: ; preds = %catch.dispatch
+ %1 = catchpad within %0 [i8* null, i32 64, i8* null]
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ catchret from %1 to label %try.cont
+
+for.body: ; preds = %for.body, %catch
+ %i.07 = phi i32 [ 0, %catch ], [ %inc, %for.body ]
+ %call = call double @floor(double 1.0) #1 [ "funclet"(token %1) ]
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+try.cont: ; preds = %for.cond.cleanup
+ ret void
+
+unreachable: ; preds = %entry
+ unreachable
+}
+
+; CHECK-LABEL: define void @test1(
+; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null]
+; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
+
+declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare double @floor(double) #1
+
+attributes #0 = { "target-features"="+sse2" }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll
new file mode 100644
index 00000000000..fd4981eb04a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll
@@ -0,0 +1,86 @@
+; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx -enable-interleaved-mem-accesses=false < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@kernel = global [512 x float] zeroinitializer, align 16
+@kernel2 = global [512 x float] zeroinitializer, align 16
+@kernel3 = global [512 x float] zeroinitializer, align 16
+@kernel4 = global [512 x float] zeroinitializer, align 16
+@src_data = global [1536 x float] zeroinitializer, align 16
+@r_ = global i8 0, align 1
+@g_ = global i8 0, align 1
+@b_ = global i8 0, align 1
+
+; We don't want to vectorize most loops containing gathers because they are
+; expensive. This function represents a point where vectorization starts to
+; become beneficial.
+; Make sure we are conservative and don't vectorize it.
+; CHECK-NOT: x float>
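+;
+; A rough C equivalent (hypothetical, for illustration; fast-math permits the
+; reassociated products):
+;
+;   float r = 0, g = 0, b = 0;
+;   for (unsigned long v = 0; v < size; ++v) {
+;     float k = kernel[v] * kernel2[v] * kernel3[v] * kernel4[v];
+;     r += src_data[3 * (v + offset)]     * k;
+;     g += src_data[3 * (v + offset) + 1] * k;
+;     b += src_data[3 * (v + offset) + 2] * k;
+;   }
+;   r_ = (unsigned char)r; g_ = (unsigned char)g; b_ = (unsigned char)b;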
+
+define void @_Z4testmm(i64 %size, i64 %offset) {
+entry:
+ %cmp53 = icmp eq i64 %size, 0
+ br i1 %cmp53, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ]
+ %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ]
+ %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ]
+ %add = add i64 %v.055, %offset
+ %mul = mul i64 %add, 3
+ %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055
+ %1 = load float, float* %arrayidx2, align 4
+ %mul3 = fmul fast float %0, %1
+ %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055
+ %2 = load float, float* %arrayidx4, align 4
+ %mul5 = fmul fast float %mul3, %2
+ %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055
+ %3 = load float, float* %arrayidx6, align 4
+ %mul7 = fmul fast float %mul5, %3
+ %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055
+ %4 = load float, float* %arrayidx8, align 4
+ %mul9 = fmul fast float %mul7, %4
+ %add10 = fadd fast float %r.057, %mul9
+ %arrayidx.sum = add i64 %mul, 1
+ %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum
+ %5 = load float, float* %arrayidx11, align 4
+ %mul13 = fmul fast float %1, %5
+ %mul15 = fmul fast float %2, %mul13
+ %mul17 = fmul fast float %3, %mul15
+ %mul19 = fmul fast float %4, %mul17
+ %add20 = fadd fast float %g.056, %mul19
+ %arrayidx.sum52 = add i64 %mul, 2
+ %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52
+ %6 = load float, float* %arrayidx21, align 4
+ %mul23 = fmul fast float %1, %6
+ %mul25 = fmul fast float %2, %mul23
+ %mul27 = fmul fast float %3, %mul25
+ %mul29 = fmul fast float %4, %mul27
+ %add30 = fadd fast float %b.054, %mul29
+ %inc = add i64 %v.055, 1
+ %exitcond = icmp ne i64 %inc, %size
+ br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+ %add30.lcssa = phi float [ %add30, %for.body ]
+ %add20.lcssa = phi float [ %add20, %for.body ]
+ %add10.lcssa = phi float [ %add10, %for.body ]
+ %phitmp = fptoui float %add10.lcssa to i8
+ %phitmp60 = fptoui float %add20.lcssa to i8
+ %phitmp61 = fptoui float %add30.lcssa to i8
+ br label %for.end
+
+for.end:
+ %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ store i8 %r.0.lcssa, i8* @r_, align 1
+ store i8 %g.0.lcssa, i8* @g_, align 1
+ store i8 %b.0.lcssa, i8* @b_, align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
new file mode 100644
index 00000000000..f38782cb7e5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mattr=avx512f < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that "gather" operation is choosen since it's cost is better
+; than interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+; for (int i=0; i<N; i+=8) {
+; B[i] = A[i] + 5;
+; }
+;}
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+
+; CHECK-LABEL: @strided_load_i64(
+; CHECK: masked.gather
+define void @strided_load_i64() {
+ br label %1
+
+; <label>:1: ; preds = %0, %1
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+ %3 = load i64, i64* %2, align 16
+ %4 = add i64 %3, 5
+ %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+ store i64 %4, i64* %5, align 16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+ %6 = icmp slt i64 %indvars.iv.next, 1024
+ br i1 %6, label %1, label %7
+
+; <label>:7: ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
new file mode 100644
index 00000000000..21ed2f2fb39
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -0,0 +1,1754 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
+; RUN: opt < %s -O3 -mcpu=knl -force-vector-width=2 -S | FileCheck %s -check-prefix=FVW2
+
+; With a forced vector width, it is sometimes more profitable to generate
+; scalarized and predicated stores instead of a masked scatter.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc_linux"
+
+; The source code:
+;
+;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
+;
+; for (int i=0; i < SIZE; ++i) {
+; if (trigger[i] > 0) {
+; out[i] = in[index[i]] + (float) 0.5;
+; }
+; }
+;}
+
+; Function Attrs: nounwind uwtable
+define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+; AVX512-LABEL: @foo1(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
+; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> undef)
+; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef)
+; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]])
+; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
+; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], zeroinitializer
+; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> undef)
+; AVX512-NEXT: [[TMP15:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_1]] to <16 x i64>
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP15]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef)
+; AVX512-NEXT: [[TMP17:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]])
+; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 32
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4
+; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], zeroinitializer
+; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> undef)
+; AVX512-NEXT: [[TMP25:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_2]] to <16 x i64>
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP25]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> [[TMP22]], <16 x float> undef)
+; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]])
+; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 48
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4
+; AVX512-NEXT: [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], zeroinitializer
+; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> undef)
+; AVX512-NEXT: [[TMP35:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_3]] to <16 x i64>
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP35]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> [[TMP32]], <16 x float> undef)
+; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]])
+; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 64
+; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
+; AVX512-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+;
+; FVW2-LABEL: @foo1(
+; FVW2-NEXT: entry:
+; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
+; FVW2: vector.body:
+; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; FVW2-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> undef)
+; FVW2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
+; FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP5]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP6]], i32 4, <2 x i1> [[TMP2]], <2 x float> undef)
+; FVW2-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
+; FVW2-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
+; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP7]], <2 x float>* [[TMP9]], i32 4, <2 x i1> [[TMP2]])
+; FVW2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 2
+; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP11]], align 4
+; FVW2-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer
+; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> undef)
+; FVW2-NEXT: [[TMP15:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64>
+; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP15]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP16]], i32 4, <2 x i1> [[TMP12]], <2 x float> undef)
+; FVW2-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; FVW2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
+; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP17]], <2 x float>* [[TMP19]], i32 4, <2 x i1> [[TMP12]])
+; FVW2-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 4
+; FVW2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
+; FVW2-NEXT: [[TMP22:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_2]], zeroinitializer
+; FVW2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef)
+; FVW2-NEXT: [[TMP25:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_2]] to <2 x i64>
+; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP25]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP22]], <2 x float> undef)
+; FVW2-NEXT: [[TMP27:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
+; FVW2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <2 x float>*
+; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP27]], <2 x float>* [[TMP29]], i32 4, <2 x i1> [[TMP22]])
+; FVW2-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 6
+; FVW2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP31]], align 4
+; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_3]], zeroinitializer
+; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
+; FVW2-NEXT: [[TMP35:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_3]] to <2 x i64>
+; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP35]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP36]], i32 4, <2 x i1> [[TMP32]], <2 x float> undef)
+; FVW2-NEXT: [[TMP37:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
+; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP37]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP32]])
+; FVW2-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 8
+; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
+; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; FVW2: for.end:
+; FVW2-NEXT: ret void
+;
+entry:
+ %in.addr = alloca float*, align 8
+ %out.addr = alloca float*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %index.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store float* %in, float** %in.addr, align 8
+ store float* %out, float** %out.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32* %index, i32** %index.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 4096
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load i32*, i32** %index.addr, align 8
+ %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+ %6 = load i32, i32* %arrayidx3, align 4
+ %idxprom4 = sext i32 %6 to i64
+ %7 = load float*, float** %in.addr, align 8
+ %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
+ %8 = load float, float* %arrayidx5, align 4
+ %add = fadd float %8, 5.000000e-01
+ %9 = load i32, i32* %i, align 4
+ %idxprom6 = sext i32 %9 to i64
+ %10 = load float*, float** %out.addr, align 8
+ %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
+ store float %add, float* %arrayidx7, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %11 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %11, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
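+
+; In @foo1 the address of each load from %in comes from a value that is itself
+; loaded out of %index, so the access pattern is not consecutive and the
+; vectorizer emits masked gathers: v16f32 gathers under the AVX512 prefix and
+; v2f32 gathers under FVW2, which presumably runs with a forced vector width
+; of 2.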
+
+; The source code:
+;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
+;
+; for (int i=0; i<SIZE; i += 16) {
+; if (trigger[i] > 0) {
+; out[i] = in[i].b + (float) 0.5;
+; }
+; }
+;}
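+;
+; Because the loop strides by 16, neither the loads from trigger nor the
+; stores to out are consecutive. Under AVX512 the body is vectorized with
+; masked gathers and scatters, fully unrolled into 16 v16 steps covering the
+; 256 trips (assuming SIZE is 4096, matching the i < 4096 bound in the IR
+; below). Under FVW2 the gathers stay 2 wide and the conditional stores are
+; scalarized into predicated pred.store.if blocks.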
+
+%struct.In = type { float, float }
+
+define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+; AVX512-LABEL: @foo2(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: ret void
+;
+; FVW2-LABEL: @foo2(
+; FVW2-NEXT: entry:
+; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
+; FVW2: vector.body:
+; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
+; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
+; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
+; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2: pred.store.if:
+; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
+; FVW2: pred.store.continue:
+; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; FVW2: pred.store.if17:
+; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]]
+; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]]
+; FVW2: pred.store.continue18:
+; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; FVW2: pred.store.if19:
+; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]]
+; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; FVW2: pred.store.continue20:
+; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2: pred.store.if21:
+; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; FVW2: pred.store.continue22:
+; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2: pred.store.if23:
+; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
+; FVW2: pred.store.continue24:
+; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2: pred.store.if25:
+; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
+; FVW2: pred.store.continue26:
+; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2: pred.store.if27:
+; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
+; FVW2: pred.store.continue28:
+; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.if29:
+; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.continue30:
+; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !2
+; FVW2: for.end:
+; FVW2-NEXT: ret void
+;
+entry:
+ %in.addr = alloca %struct.In*, align 8
+ %out.addr = alloca float*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %index.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store %struct.In* %in, %struct.In** %in.addr, align 8
+ store float* %out, float** %out.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32* %index, i32** %index.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 4096
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load %struct.In*, %struct.In** %in.addr, align 8
+ %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
+ %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
+ %6 = load float, float* %b, align 4
+ %add = fadd float %6, 5.000000e-01
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load float*, float** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
+ store float %add, float* %arrayidx5, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %9 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %9, 16
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; The source code:
+;struct Out {
+; float a;
+; float b;
+;};
+;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
+;
+; for (int i=0; i<SIZE; i += 16) {
+; if (trigger[i] > 0) {
+; out[i].b = in[i].b + (float) 0.5;
+; }
+; }
+;}
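+;
+; foo3 is foo2 with the store retargeted at the field out[i].b, so the AVX512
+; scatter below indexes element 1 of %struct.Out instead of a flat float
+; array; the gather side is unchanged from foo2.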
+
+%struct.Out = type { float, float }
+
+define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
+; AVX512-LABEL: @foo3(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: ret void
+;
+; FVW2-LABEL: @foo3(
+; FVW2-NEXT: entry:
+; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
+; FVW2: vector.body:
+; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ]
+; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ]
+; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT: [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4
+; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD6]]
+; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER9]], zeroinitializer
+; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD6]], i32 1
+; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER12]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2: pred.store.if:
+; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1
+; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
+; FVW2: pred.store.continue:
+; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; FVW2: pred.store.if16:
+; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP20]], i32 1
+; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE17]]
+; FVW2: pred.store.continue17:
+; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; FVW2: pred.store.if18:
+; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP24]], i32 1
+; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE19]]
+; FVW2: pred.store.continue19:
+; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; FVW2: pred.store.if20:
+; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP28]], i32 1
+; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE21]]
+; FVW2: pred.store.continue21:
+; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; FVW2: pred.store.if22:
+; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP32]], i32 1
+; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]]
+; FVW2: pred.store.continue23:
+; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; FVW2: pred.store.if24:
+; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP36]], i32 1
+; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]]
+; FVW2: pred.store.continue25:
+; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; FVW2: pred.store.if26:
+; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP40]], i32 1
+; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]]
+; FVW2: pred.store.continue27:
+; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29]]
+; FVW2: pred.store.if28:
+; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP44]], i32 1
+; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]]
+; FVW2: pred.store.continue29:
+; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !3
+; FVW2: for.end:
+; FVW2-NEXT: ret void
+;
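+; A minimal C sketch of the scalar loop below (added for readability and
+; reconstructed from the IR, not taken verbatim from the original source;
+; the .b field names follow the %struct.In/%struct.Out GEPs at field
+; index 1):
+;
+;   void foo3(struct In *in, struct Out *out, int *trigger) {
+;     for (int i = 0; i < 4096; i += 16)     /* %cmp / %inc below */
+;       if (trigger[i] > 0)                  /* %cmp1 below */
+;         out[i].b = in[i].b + 0.5f;         /* %add / store below */
+;   }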
+entry:
+ %in.addr = alloca %struct.In*, align 8
+ %out.addr = alloca %struct.Out*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store %struct.In* %in, %struct.In** %in.addr, align 8
+ store %struct.Out* %out, %struct.Out** %out.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 4096
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load %struct.In*, %struct.In** %in.addr, align 8
+ %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
+ %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
+ %6 = load float, float* %b, align 4
+ %add = fadd float %6, 5.000000e-01
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load %struct.Out*, %struct.Out** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
+ %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
+ store float %add, float* %b6, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %9 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %9, 16
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+
+; The same as @foo2, but the scatter/gather pointer operands are vectors of pointers in addrspace(1).
+
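+; A minimal C sketch of what @foo2_addrspace computes (an illustration
+; inferred from the @foo2 comment above and the checks below, assuming the
+; same 4096-iteration trip count; the address_space attribute spelling is
+; the Clang extension, not something this file states):
+;
+;   void foo2_addrspace(struct In __attribute__((address_space(1))) *in,
+;                       float __attribute__((address_space(1))) *out,
+;                       int *trigger, int *index) {
+;     for (int i = 0; i < 4096; i += 16)
+;       if (trigger[i] > 0)
+;         out[i] = in[i].b + 0.5f;
+;   }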
+define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+; AVX512-LABEL: @foo2_addrspace(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: ret void
+;
+; FVW2-LABEL: @foo2_addrspace(
+; FVW2-NEXT: entry:
+; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
+; FVW2: vector.body:
+; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
+; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
+; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
+; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2: pred.store.if:
+; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
+; FVW2: pred.store.continue:
+; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; FVW2: pred.store.if17:
+; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]]
+; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]]
+; FVW2: pred.store.continue18:
+; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; FVW2: pred.store.if19:
+; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]]
+; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; FVW2: pred.store.continue20:
+; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2: pred.store.if21:
+; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; FVW2: pred.store.continue22:
+; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2: pred.store.if23:
+; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
+; FVW2: pred.store.continue24:
+; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2: pred.store.if25:
+; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
+; FVW2: pred.store.continue26:
+; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2: pred.store.if27:
+; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
+; FVW2: pred.store.continue28:
+; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.if29:
+; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.continue30:
+; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; FVW2: for.end:
+; FVW2-NEXT: ret void
+;
+entry:
+ %in.addr = alloca %struct.In addrspace(1)*, align 8
+ %out.addr = alloca float addrspace(1)*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %index.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+ store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32* %index, i32** %index.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 4096
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+ %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+ %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+ %6 = load float, float addrspace(1)* %b, align 4
+ %add = fadd float %6, 5.000000e-01
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+ store float %add, float addrspace(1)* %arrayidx5, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %9 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %9, 16
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
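+
+; For reference, the scalar source pattern for these foo2_addrspace* tests is
+; roughly the following C loop (a sketch reconstructed from the IR above; the
+; OpenCL-style __global qualifier standing in for addrspace(1), and any layout
+; of struct In beyond field 1 being the float 'b', are assumptions):
+;
+;   void foo2(__global struct In *in, __global float *out,
+;             int *trigger, int *index) {
+;     for (int i = 0; i < 4096; i += 16)   // stride-16 loop, 256 iterations
+;       if (trigger[i] > 0)                // mask comes from gathered trigger[i]
+;         out[i] = in[i].b + 0.5f;         // gather in[i].b, scatter to out[i]
+;   }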
+
+; Same as foo2_addrspace, but here only the input (%in) has the non-default address space.
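+; Since the masked gather/scatter intrinsics are mangled with the pointer
+; vector's address space, the AVX512 checks below expect the store side as
+; @llvm.masked.scatter.v16f32.v16p0f32 (default address space for %out), while
+; the load side keeps @llvm.masked.gather.v16f32.v16p1f32 for %in.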
+
+define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+; AVX512-LABEL: @foo2_addrspace2(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: ret void
+;
+; FVW2-LABEL: @foo2_addrspace2(
+; FVW2-NEXT: entry:
+; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
+; FVW2: vector.body:
+; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
+; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
+; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
+; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2: pred.store.if:
+; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
+; FVW2: pred.store.continue:
+; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; FVW2: pred.store.if17:
+; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]]
+; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]]
+; FVW2: pred.store.continue18:
+; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; FVW2: pred.store.if19:
+; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]]
+; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; FVW2: pred.store.continue20:
+; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2: pred.store.if21:
+; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; FVW2: pred.store.continue22:
+; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2: pred.store.if23:
+; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
+; FVW2: pred.store.continue24:
+; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2: pred.store.if25:
+; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
+; FVW2: pred.store.continue26:
+; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2: pred.store.if27:
+; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
+; FVW2: pred.store.continue28:
+; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.if29:
+; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.continue30:
+; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; FVW2: for.end:
+; FVW2-NEXT: ret void
+;
+entry:
+ %in.addr = alloca %struct.In addrspace(1)*, align 8
+ %out.addr = alloca float addrspace(0)*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %index.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+ store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32* %index, i32** %index.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 4096
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+ %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+ %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+ %6 = load float, float addrspace(1)* %b, align 4
+ %add = fadd float %6, 5.000000e-01
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4
+ store float %add, float addrspace(0)* %arrayidx5, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %9 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %9, 16
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; Same as foo2_addrspace, but here only the output has the non-default address space.
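+;
+; For reference, a C-level sketch of the scalar loop below, reconstructed from
+; the IR (the struct field name `b`, the step of 16, and the 4096 bound are
+; taken from the function body; `__global` is shorthand for addrspace(1)):
+;
+;   void foo2_addrspace3 (struct In *in, float __global *out,
+;                         int *trigger, int *index) {
+;     for (int i = 0; i < 4096; i += 16)
+;       if (trigger[i] > 0)
+;         out[i] = in[i].b + 0.5f;
+;   }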
+
+define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+; AVX512-LABEL: @foo2_addrspace3(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: ret void
+;
+; FVW2-LABEL: @foo2_addrspace3(
+; FVW2-NEXT: entry:
+; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
+; FVW2: vector.body:
+; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ]
+; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32>
+; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64>
+; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96>
+; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4
+; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]]
+; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]]
+; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]]
+; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer
+; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer
+; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer
+; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1
+; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1
+; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1
+; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1
+; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef)
+; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef)
+; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FVW2: pred.store.if:
+; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]]
+; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]]
+; FVW2: pred.store.continue:
+; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; FVW2: pred.store.if17:
+; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16
+; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]]
+; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]]
+; FVW2: pred.store.continue18:
+; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; FVW2: pred.store.if19:
+; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32
+; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]]
+; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; FVW2: pred.store.continue20:
+; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2: pred.store.if21:
+; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; FVW2: pred.store.continue22:
+; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2: pred.store.if23:
+; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
+; FVW2: pred.store.continue24:
+; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2: pred.store.if25:
+; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
+; FVW2: pred.store.continue26:
+; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2: pred.store.if27:
+; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
+; FVW2: pred.store.continue28:
+; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.if29:
+; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.continue30:
+; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; FVW2: for.end:
+; FVW2-NEXT: ret void
+;
+entry:
+ %in.addr = alloca %struct.In addrspace(0)*, align 8
+ %out.addr = alloca float addrspace(1)*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %index.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8
+ store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32* %index, i32** %index.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 4096
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8
+ %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2
+ %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1
+ %6 = load float, float addrspace(0)* %b, align 4
+ %add = fadd float %6, 5.000000e-01
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+ store float %add, float addrspace(1)* %arrayidx5, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %9 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %9, 16
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll
new file mode 100644
index 00000000000..c581f4bf2a6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -0,0 +1,77 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-interleave=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 4; the checks below expect <4 x i32> operations.
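+;
+; In C, the loop being vectorized (a sketch from the IR below; a, b and c are
+; the file-level globals and the 256 trip count comes from the exit compare):
+;
+;   for (int i = 0; i < 256; ++i)
+;     a[i] = b[i] + c[i];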
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+
+;UNROLL-LABEL: @example1(
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
+define void @example1() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+; Select VF=4 because sext <8 x i16> to <8 x i32> is expensive.
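+;
+; In C, the slice of the original example10b that this test keeps (a sketch
+; from the IR below; only the sb -> ia widening copy survives, the remaining
+; parameters are unused):
+;
+;   for (int i = 0; i < 1024; ++i)
+;     ia[i] = (int)sb[i];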
+;CHECK-LABEL: @example10b(
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example10b(
+;UNROLL: load <4 x i16>
+;UNROLL: load <4 x i16>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+ %3 = load i16, i16* %2, align 2
+ %4 = sext i16 %3 to i32
+ %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+ store i32 %4, i32* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
new file mode 100644
index 00000000000..683e857e5f2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR15794
+; Incorrectly adding llvm.mem.parallel_loop_access metadata is undefined
+; behaviour. The vectorizer ignores the memory dependency checks, goes ahead,
+; and vectorizes this loop, whose uniform stores carry an output dependency.
+
+; void foo(int *a, int *b, int k, int m) {
+; for (int i = 0; i < m; i++) {
+; for (int j = 0; j < m; j++) {
+; a[i] = a[i + j + k] + 1; <<<
+; }
+; b[i] = b[i] + 3;
+; }
+; }
+
+; Function Attrs: nounwind uwtable
+define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]]
+; CHECK: for.body3.lr.ph.us.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64
+; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]]
+; CHECK: for.end.us:
+; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP4]], 3
+; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
+; CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
+; CHECK-NEXT: br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2
+; CHECK: for.body3.us:
+; CHECK-NEXT: [[INDVARS_IV29:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
+; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP5]]
+; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP6]], 1
+; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
+; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
+; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3
+; CHECK: for.body3.lr.ph.us:
+; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]]
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
+; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]]
+; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP8]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 false, i1 [[TMP12]], i1 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i1 false, [[TMP15]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[ADD_US]], [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP24]], i32 0
+; CHECK-NEXT: store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP24]], i32 1
+; CHECK-NEXT: store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP24]], i32 2
+; CHECK-NEXT: store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3
+; CHECK-NEXT: store i32 [[TMP28]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY3_US]]
+; CHECK: for.end15.loopexit:
+; CHECK-NEXT: br label [[FOR_END15]]
+; CHECK: for.end15:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp27 = icmp sgt i32 %m, 0
+ br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
+
+for.end.us: ; preds = %for.body3.us
+ %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
+ %0 = load i32, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+ %add10.us = add nsw i32 %0, 3
+ store i32 %add10.us, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3
+ %indvars.iv.next34 = add i64 %indvars.iv33, 1
+ %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+ %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
+ br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5
+
+for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us
+ %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
+ %1 = trunc i64 %indvars.iv29 to i32
+ %add4.us = add i32 %add.us, %1
+ %idxprom.us = sext i32 %add4.us to i64
+ %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
+ %2 = load i32, i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3
+ %add5.us = add nsw i32 %2, 1
+ store i32 %add5.us, i32* %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3
+ %indvars.iv.next30 = add i64 %indvars.iv29, 1
+ %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+ %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
+ br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4
+
+for.body3.lr.ph.us: ; preds = %for.end.us, %entry
+ %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
+ %3 = trunc i64 %indvars.iv33 to i32
+ %add.us = add i32 %3, %k
+ %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+ br label %for.body3.us
+
+for.end15: ; preds = %for.end.us, %entry
+ ret void
+}
+
+; Same test as above, but without the invalid parallel_loop_access metadata.
+
+; Here we can see that the vectorizer performs the memory dependence checks and
+; decides it is unsafe to vectorize: every inner-loop iteration stores to the
+; same a[i], the output dependency that the bogus metadata hid above.
+define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+; CHECK-LABEL: @no-par-mem-metadata(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]]
+; CHECK: for.body3.lr.ph.us.preheader:
+; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]]
+; CHECK: for.end.us:
+; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP0]], 3
+; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
+; CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]]
+; CHECK-NEXT: br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2
+; CHECK: for.body3.us:
+; CHECK-NEXT: [[INDVARS_IV29:%.*]] = phi i64 [ 0, [[FOR_BODY3_LR_PH_US]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
+; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP1]]
+; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
+; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4
+; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP2]], 1
+; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
+; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]]
+; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !1
+; CHECK: for.body3.lr.ph.us:
+; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
+; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP3]], [[K:%.*]]
+; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
+; CHECK-NEXT: br label [[FOR_BODY3_US]]
+; CHECK: for.end15.loopexit:
+; CHECK-NEXT: br label [[FOR_END15]]
+; CHECK: for.end15:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp27 = icmp sgt i32 %m, 0
+ br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
+
+for.end.us: ; preds = %for.body3.us
+ %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33
+ %0 = load i32, i32* %arrayidx9.us, align 4
+ %add10.us = add nsw i32 %0, 3
+ store i32 %add10.us, i32* %arrayidx9.us, align 4
+ %indvars.iv.next34 = add i64 %indvars.iv33, 1
+ %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+ %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
+ br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5
+
+for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us
+ %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
+ %1 = trunc i64 %indvars.iv29 to i32
+ %add4.us = add i32 %add.us, %1
+ %idxprom.us = sext i32 %add4.us to i64
+ %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us
+ %2 = load i32, i32* %arrayidx.us, align 4
+ %add5.us = add nsw i32 %2, 1
+ store i32 %add5.us, i32* %arrayidx7.us, align 4
+ %indvars.iv.next30 = add i64 %indvars.iv29, 1
+ %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+ %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
+ br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4
+
+for.body3.lr.ph.us: ; preds = %for.end.us, %entry
+ %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
+ %3 = trunc i64 %indvars.iv33 to i32
+ %add.us = add i32 %3, %k
+ %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33
+ br label %for.body3.us
+
+for.end15: ; preds = %for.end.us, %entry
+ ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!3 = !{!4, !5}
+!4 = !{!4}
+!5 = !{!5}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
new file mode 100644
index 00000000000..f9ccbf146fc
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin -mattr=+avx %s | FileCheck %s --check-prefixes=CHECK,AVX
+
+; Two mostly identical functions. The only difference is the presence of
+; fast-math flags on the second. The loop is a pretty simple reduction:
+
+; for (int i = 0; i < 32; ++i)
+; if (arr[i] != 42)
+; tot += arr[i];
+
+define double @sumIfScalar(double* nocapture readonly %arr) {
+; CHECK-LABEL: @sumIfScalar(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; CHECK-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
+; CHECK-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; CHECK-NEXT: [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01
+; CHECK-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
+; CHECK: do.add:
+; CHECK-NEXT: [[TOT_NEW:%.*]] = fadd double [[TOT]], [[NEXTVAL]]
+; CHECK-NEXT: br label [[NEXT_ITER]]
+; CHECK: no.add:
+; CHECK-NEXT: br label [[NEXT_ITER]]
+; CHECK: next.iter:
+; CHECK-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
+; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
+; CHECK-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
+; CHECK: done:
+; CHECK-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
+; CHECK-NEXT: ret double [[TOT_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [0, %entry], [%i.next, %next.iter]
+ %tot = phi double [0.0, %entry], [%tot.next, %next.iter]
+
+ %addr = getelementptr double, double* %arr, i32 %i
+ %nextval = load double, double* %addr
+
+ %tst = fcmp une double %nextval, 42.0
+ br i1 %tst, label %do.add, label %no.add
+
+do.add:
+ %tot.new = fadd double %tot, %nextval
+ br label %next.iter
+
+no.add:
+ br label %next.iter
+
+next.iter:
+ %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
+ %i.next = add i32 %i, 1
+ %again = icmp ult i32 %i.next, 32
+ br i1 %again, label %loop, label %done
+
+done:
+ ret double %tot.next
+}
+
+define double @sumIfVector(double* nocapture readonly %arr) {
+; SSE-LABEL: @sumIfVector(
+; SSE-NEXT: entry:
+; SSE-NEXT: br label [[LOOP:%.*]]
+; SSE: loop:
+; SSE-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; SSE-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; SSE-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]]
+; SSE-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; SSE-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
+; SSE-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
+; SSE: do.add:
+; SSE-NEXT: [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
+; SSE-NEXT: br label [[NEXT_ITER]]
+; SSE: no.add:
+; SSE-NEXT: br label [[NEXT_ITER]]
+; SSE: next.iter:
+; SSE-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
+; SSE-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
+; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]]
+; SSE: done:
+; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ]
+; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]]
+;
+; AVX-LABEL: @sumIfVector(
+; AVX-NEXT: entry:
+; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX: vector.ph:
+; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX: vector.body:
+; AVX-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
+; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; AVX-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]]
+; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0
+; AVX-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>*
+; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8
+; AVX-NEXT: [[TMP4:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01, double 4.200000e+01, double 4.200000e+01>
+; AVX-NEXT: [[TMP5:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]]
+; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]]
+; AVX-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; AVX-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
+; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AVX: middle.block:
+; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[PREDPHI]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; AVX-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[PREDPHI]], [[RDX_SHUF]]
+; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
+; AVX-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]]
+; AVX: scalar.ph:
+; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT: br label [[LOOP:%.*]]
+; AVX: loop:
+; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ]
+; AVX-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ]
+; AVX-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]]
+; AVX-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]]
+; AVX-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01
+; AVX-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]]
+; AVX: do.add:
+; AVX-NEXT: [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]]
+; AVX-NEXT: br label [[NEXT_ITER]]
+; AVX: no.add:
+; AVX-NEXT: br label [[NEXT_ITER]]
+; AVX: next.iter:
+; AVX-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
+; AVX-NEXT: [[I_NEXT]] = add i32 [[I]], 1
+; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
+; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop !2
+; AVX: done:
+; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %i = phi i32 [0, %entry], [%i.next, %next.iter]
+ %tot = phi double [0.0, %entry], [%tot.next, %next.iter]
+
+ %addr = getelementptr double, double* %arr, i32 %i
+ %nextval = load double, double* %addr
+
+ %tst = fcmp fast une double %nextval, 42.0
+ br i1 %tst, label %do.add, label %no.add
+
+do.add:
+ %tot.new = fadd fast double %tot, %nextval
+ br label %next.iter
+
+no.add:
+ br label %next.iter
+
+next.iter:
+ %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
+ %i.next = add i32 %i, 1
+ %again = icmp ult i32 %i.next, 32
+ br i1 %again, label %loop, label %done
+
+done:
+ ret double %tot.next
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
new file mode 100644
index 00000000000..4d7c0b6f64b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
@@ -0,0 +1,76 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; This test checks that gather/scatter instructions are not used for the i128
+; data type.
+;CHECK-NOT: gather
+;CHECK-NOT: scatter
+
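+; A rough C equivalent of the initialization loop in @main below,
+; reconstructed from the IR as a non-authoritative sketch:
+;
+; __int128 x[151];
+; __int128 i = 1, j = 99999;
+; do {
+;   x[i]     = j & 32767;
+;   x[i + 1] = (j + 11111) & 32767;
+;   x[i + 2] = (j + 22222) & 32767;
+;   j += 33333;
+;   i += 3;
+; } while (i < 149);
+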
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = common global [151 x i128] zeroinitializer, align 16
+@.str = private unnamed_addr constant [46 x i8] c" PASS.....Y3 1/1 (BUBBLE SORT), X(25) = 5085\0A\00", align 1
+@.str.1 = private unnamed_addr constant [44 x i8] c" FAIL.....Y3 1/1 (BUBBLE SORT), X(25) = %d\0A\00", align 1
+@str = private unnamed_addr constant [45 x i8] c" PASS.....Y3 1/1 (BUBBLE SORT), X(25) = 5085\00"
+
+; Function Attrs: noinline nounwind uwtable
+declare i32 @y3inner() #0
+
+define i32 @main() local_unnamed_addr #0 {
+entry:
+ br label %do.body
+
+do.body: ; preds = %do.body, %entry
+ %j.0 = phi i128 [ 99999, %entry ], [ %add10, %do.body ]
+ %i.0 = phi i128 [ 1, %entry ], [ %add11, %do.body ]
+ %and = and i128 %j.0, 32767
+ %idxprom = trunc i128 %i.0 to i64
+ %arrayidx = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom
+ store i128 %and, i128* %arrayidx, align 16
+ %add = add nuw nsw i128 %j.0, 11111
+ %and1 = and i128 %add, 32767
+ %add2 = add nuw nsw i128 %i.0, 1
+ %idxprom3 = trunc i128 %add2 to i64
+ %arrayidx4 = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom3
+ store i128 %and1, i128* %arrayidx4, align 16
+ %add5 = add nuw nsw i128 %j.0, 22222
+ %and6 = and i128 %add5, 32767
+ %add7 = add nuw nsw i128 %i.0, 2
+ %idxprom8 = trunc i128 %add7 to i64
+ %arrayidx9 = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom8
+ store i128 %and6, i128* %arrayidx9, align 16
+ %add10 = add nuw nsw i128 %j.0, 33333
+ %add11 = add nuw nsw i128 %i.0, 3
+ %cmp = icmp slt i128 %add11, 149
+ br i1 %cmp, label %do.body, label %do.end
+
+do.end: ; preds = %do.body
+ store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 149), align 16
+ store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 150), align 16
+ %call = tail call i32 @y3inner()
+ %0 = load i128, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 25), align 16
+ %cmp12 = icmp eq i128 %0, 5085
+ br i1 %cmp12, label %if.then, label %if.else
+
+if.then: ; preds = %do.end
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @str, i64 0, i64 0))
+ br label %if.end
+
+if.else: ; preds = %do.end
+ %coerce.sroa.0.0.extract.trunc = trunc i128 %0 to i64
+ %coerce.sroa.2.0.extract.shift = lshr i128 %0, 64
+ %coerce.sroa.2.0.extract.trunc = trunc i128 %coerce.sroa.2.0.extract.shift to i64
+ %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([44 x i8], [44 x i8]* @.str.1, i64 0, i64 0), i64 %coerce.sroa.0.0.extract.trunc, i64 %coerce.sroa.2.0.extract.trunc)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret i32 0
+}
+
+; Function Attrs: nounwind
+declare i32 @printf(i8*, ...) #1
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) #2
+
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
new file mode 100644
index 00000000000..15ec344cc3d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -mtriple x86_64 -S | FileCheck %s
+
+%struct.ST4 = type { i32, i32, i32, i32 }
+
+; The gaps between the memory accesses in this function are too large for
+; interleaving.
+
+; Test from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=7560
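+;
+; A rough C equivalent, reconstructed from the IR as a non-authoritative
+; sketch; the second store is offset by -2147483648 i32 elements from the
+; first, which is what creates the oversized gap:
+;
+; struct ST4 { int a, b, c, d; };
+; void test1(struct ST4 *B) {
+;   for (long i = 0; i < 1024; i++) {
+;     B[i].a = 65536;
+;     *(&B[i].a - 2147483648L) = 65536; /* huge gap */
+;     B[i].c = 10;
+;     B[i].d = 12;
+;   }
+; }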
+define void @test1(%struct.ST4* noalias %B) {
+; CHECK-LABEL: @test1
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label %for.body
+
+; CHECK-LABEL: for.body:
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %p1 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
+ store i32 65536, i32* %p1, align 4
+ %p2 = getelementptr i32, i32* %p1, i32 -2147483648
+ store i32 65536, i32* %p2, align 4
+ %p3 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
+ store i32 10, i32* %p3, align 4
+ %p4 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
+ store i32 12, i32* %p4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
new file mode 100644
index 00000000000..9294c92b575
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=NORMAL
+; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM
+
+; NORMAL-LABEL: foo
+; NORMAL: %[[WIDE:.*]] = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; NORMAL: %[[STRIDED1:.*]] = shufflevector <8 x i32> %[[WIDE]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; NORMAL: %[[STRIDED2:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; NORMAL: add nsw <4 x i32> %[[STRIDED2]], %[[STRIDED1]]
+
+; ATOM-LABEL: foo
+; ATOM: load i32
+; ATOM: load i32
+; ATOM: store i32
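+;
+; A rough C equivalent of the loop below, reconstructed from the IR as a
+; non-authoritative sketch:
+;
+; void foo(int *a, int *b) {
+;   for (long i = 0; i < 1024; i++)
+;     a[i] = b[2 * i] + b[2 * i + 1];
+; }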
+define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %2 = or i64 %0, 1
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx3, align 4
+ %add4 = add nsw i32 %3, %1
+ %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %add4, i32* %arrayidx6, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
new file mode 100644
index 00000000000..cf6cc1356e0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mattr=avx512f -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
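+; The loop below conditionally loads from the loop-invariant pointer %a. A
+; rough C equivalent, reconstructed from the IR as a non-authoritative sketch:
+;
+; int inv_load_conditional(int *a, long n, int *b) {
+;   int last;
+;   for (long i = 0; i < n; i++) {
+;     b[i] = (int)n;
+;     last = a ? *a : 1;
+;   }
+;   return last;
+; }
+;
+; With AVX-512 the invariant conditional load becomes a masked gather, as the
+; checks below show.
+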
+define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_load_conditional(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT5]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT6]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT6]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1>
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[A]], null
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[LATCH]], label [[COND_LOAD:%.*]]
+; CHECK: cond_load:
+; CHECK-NEXT: [[ALOAD:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ]
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7
+; CHECK: for.end:
+; CHECK-NEXT: [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[A_LCSSA_LCSSA]]
+;
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp ne i32* %a, null
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_load, label %latch
+
+cond_load:
+ %aload = load i32, i32* %a, align 4
+ br label %latch
+
+latch:
+ %a.lcssa = phi i32 [ %aload, %cond_load ], [ 1, %for.body ]
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret i32 %a.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
new file mode 100644
index 00000000000..69f578cf789
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mattr=avx512f -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The first test checks that a loop with a reduction and a uniform store gets
+; vectorized.
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction
+; CHECK-LABEL: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <16 x i32>
+; CHECK: [[ADD]] = add <16 x i32> %vec.phi, %wide.load
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK-NOT: store i32 %ntrunc, i32* %a
+; CHECK: %index.next = add i64 %index, 64
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <16 x i32>
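+;
+; A rough C equivalent of the loop below, reconstructed from the IR as a
+; non-authoritative sketch (the store to *a is uniform across iterations):
+;
+; int reduce(int *a, long n, int *b) {
+;   int sum = 0;
+;   for (long i = 0; i < n; i++) {
+;     sum += b[i];
+;     *a = (int)n;
+;   }
+;   return sum;
+; }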
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %tmp3 = add i32 %tmp0, %tmp2
+ store i32 %ntrunc, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %tmp4 = phi i32 [ %tmp3, %for.body ]
+ ret i32 %tmp4
+}
+
+; The second test checks a conditional store of an invariant value to an
+; invariant address:
+; if (b[i] == k) *a = ntrunc
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+; CHECK-LABEL: @inv_val_store_to_inv_address_conditional(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT8]], <16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK: cond_store:
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+ store i32 %ntrunc, i32* %a
+ br label %latch
+
+latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
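+; The third test stores a variant value to an invariant address,
+; conditionally (a non-authoritative sketch reconstructed from the IR below):
+; if (b[i] == k) *a = c[i]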
+define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) {
+; CHECK-LABEL: @variant_val_store_to_inv_address_conditional(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[C5:%.*]] = bitcast i32* [[C:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i32, i32* [[C]], i64 [[SMAX2]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND08:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[B]]
+; CHECK-NEXT: [[BOUND19:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]]
+; CHECK-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT: [[BOUND012:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[A]]
+; CHECK-NEXT: [[BOUND113:%.*]] = icmp ugt i8* [[UGLYGEP]], [[C5]]
+; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT16]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT18]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT20]], <16 x i32*> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !15, !noalias !18
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT17]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP5]], align 4, !alias.scope !15, !noalias !18
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 8, <16 x i1> [[TMP4]], <16 x i32> undef), !alias.scope !21
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !23
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]]
+; CHECK: cond_store:
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 8
+; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !24
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+ %tmp3 = getelementptr inbounds i32, i32* %c, i64 %i
+ %tmp4 = load i32, i32* %tmp3, align 8
+ store i32 %tmp4, i32* %a
+ br label %latch
+
+latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/X86/lit.local.cfg
new file mode 100644
index 00000000000..e71f3cc4c41
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if 'X86' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
new file mode 100644
index 00000000000..419cedbe145
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll
@@ -0,0 +1,3374 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2
+; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc_linux"
+
+; The source code:
+;
+;void foo1(int *A, int *B, int *trigger) {
+;
+; for (int i=0; i<10000; i++) {
+; if (trigger[i] < 100) {
+; A[i] = B[i] + trigger[i];
+; }
+; }
+;}
+
+; Function Attrs: nounwind uwtable
+define void @foo1(i32* %A, i32* %B, i32* %trigger) {
+; AVX1-LABEL: @foo1(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
+; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
+; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
+; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1: vector.body:
+; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0
+; AVX1-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !3
+; AVX1-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP5]], <8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !5, !noalias !7
+; AVX1-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8
+; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0
+; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
+; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
+; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP13]], <8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
+; AVX1-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16
+; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: if.then.1:
+; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
+; AVX1-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]]
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10
+;
+; AVX2-LABEL: @foo1(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
+; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
+; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
+; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 8
+; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16
+; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !0
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 24
+; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !0
+; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 8
+; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16
+; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 24
+; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
+; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
+; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP20]], <8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 8
+; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP21]], <8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16
+; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP22]], <8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 24
+; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP23]], <8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP33]], align 4, !alias.scope !0
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 8
+; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4, !alias.scope !0
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16
+; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP37]], align 4, !alias.scope !0
+; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 24
+; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP39]], align 4, !alias.scope !0
+; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 8
+; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16
+; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 24
+; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3
+; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
+; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
+; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
+; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <8 x i32>*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP52]], <8 x i32>* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 8
+; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP53]], <8 x i32>* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16
+; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <8 x i32>*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP54]], <8 x i32>* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 24
+; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <8 x i32>*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP55]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !5, !noalias !7
+; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64
+; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
+; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX2: for.body.preheader:
+; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: if.then.1:
+; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
+; AVX2-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4
+; AVX2-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4
+; AVX2-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10
+;
+; AVX512-LABEL: @foo1(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]]
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4, !alias.scope !0
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !0
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 32
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !0
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 48
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !0
+; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 32
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 48
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
+; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
+; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP20]], <16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16
+; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP21]], <16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 32
+; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP22]], <16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 48
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP23]], <16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP33]], align 4, !alias.scope !0
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4, !alias.scope !0
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 32
+; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP37]], align 4, !alias.scope !0
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 48
+; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP39]], align 4, !alias.scope !0
+; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16
+; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 32
+; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 48
+; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3
+; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
+; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
+; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
+; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <16 x i32>*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP52]], <16 x i32>* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16
+; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <16 x i32>*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP53]], <16 x i32>* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 32
+; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <16 x i32>*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP54]], <16 x i32>* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 48
+; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <16 x i32>*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP55]], <16 x i32>* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !5, !noalias !7
+; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128
+; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
+; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
+; AVX512-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4
+; AVX512-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4
+; AVX512-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10
+;
+entry:
+ %A.addr = alloca i32*, align 8
+ %B.addr = alloca i32*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store i32* %A, i32** %A.addr, align 8
+ store i32* %B, i32** %B.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp slt i32 %3, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load i32*, i32** %B.addr, align 8
+ %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+ %6 = load i32, i32* %arrayidx3, align 4
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+ %9 = load i32, i32* %arrayidx5, align 4
+ %add = add nsw i32 %6, %9
+ %10 = load i32, i32* %i, align 4
+ %idxprom6 = sext i32 %10 to i64
+ %11 = load i32*, i32** %A.addr, align 8
+ %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
+ store i32 %add, i32* %arrayidx7, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %12 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %12, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; The same as @foo1, but all of the pointers are in address space 1.
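+;
+; In particular, the masked intrinsics in the checks below are expected to be
+; mangled with address space 1; a representative shape from the AVX1 run
+; (operands elided) is:
+;
+;   call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* ..., i32 4, <8 x i1> ..., <8 x i32> undef)
+;
+; whereas @foo1 above uses the p0v8i32 (address space 0) variants.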
+
+; Function Attrs: nounwind uwtable
+define void @foo1_addrspace1(i32 addrspace(1)* %A, i32 addrspace(1)* %B, i32 addrspace(1)* %trigger) {
+; AVX1-LABEL: @foo1_addrspace1(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
+; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
+; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
+; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
+; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1: vector.body:
+; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
+; AVX1-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32 addrspace(1)* [[TMP3]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !14
+; AVX1-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP5]], <8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !16, !noalias !18
+; AVX1-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8
+; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11
+; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(1)* [[TMP11]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14
+; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
+; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
+; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP13]], <8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
+; AVX1-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16
+; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP17:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
+; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP19:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: if.then.1:
+; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP20:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
+; AVX1-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]]
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20
+;
+; AVX2-LABEL: @foo1_addrspace1(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
+; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
+; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
+; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
+; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 8
+; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16
+; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 24
+; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11
+; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 8
+; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16
+; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 24
+; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
+; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
+; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP20]], <8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 8
+; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP21]], <8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16
+; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP22]], <8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 24
+; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP23]], <8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 8
+; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16
+; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11
+; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 24
+; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11
+; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 8
+; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16
+; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 24
+; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !14
+; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
+; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
+; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
+; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
+; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP52]], <8 x i32> addrspace(1)* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 8
+; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP53]], <8 x i32> addrspace(1)* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16
+; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP54]], <8 x i32> addrspace(1)* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 24
+; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <8 x i32> addrspace(1)*
+; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP55]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !16, !noalias !18
+; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64
+; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
+; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX2: for.body.preheader:
+; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
+; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: if.then.1:
+; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
+; AVX2-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4
+; AVX2-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4
+; AVX2-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20
+;
+; AVX512-LABEL: @foo1_addrspace1(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]]
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
+; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 32
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 48
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11
+; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 32
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 48
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]]
+; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]]
+; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]]
+; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]]
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP20]], <16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16
+; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP21]], <16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 32
+; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP22]], <16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 48
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP23]], <16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 32
+; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 48
+; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11
+; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16
+; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 32
+; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 48
+; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !14
+; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]]
+; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]]
+; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]]
+; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]]
+; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP52]], <16 x i32> addrspace(1)* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16
+; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP53]], <16 x i32> addrspace(1)* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 32
+; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP54]], <16 x i32> addrspace(1)* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 48
+; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <16 x i32> addrspace(1)*
+; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP55]], <16 x i32> addrspace(1)* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !16, !noalias !18
+; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128
+; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984
+; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4
+; AVX512-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]]
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4
+; AVX512-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]]
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4
+; AVX512-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]]
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20
+;
+entry:
+ %A.addr = alloca i32 addrspace(1)*, align 8
+ %B.addr = alloca i32 addrspace(1)*, align 8
+ %trigger.addr = alloca i32 addrspace(1)*, align 8
+ %i = alloca i32, align 4
+ store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 8
+ store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 8
+ store i32 addrspace(1)* %trigger, i32 addrspace(1)** %trigger.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom
+ %3 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %cmp1 = icmp slt i32 %3, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 8
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom2
+ %6 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom4
+ %9 = load i32, i32 addrspace(1)* %arrayidx5, align 4
+ %add = add nsw i32 %6, %9
+ %10 = load i32, i32* %i, align 4
+ %idxprom6 = sext i32 %10 to i64
+ %11 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 8
+ %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %11, i64 %idxprom6
+ store i32 %add, i32 addrspace(1)* %arrayidx7, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %12 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %12, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; The source code:
+;
+;void foo2(float *A, float *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;      A[i] = B[i] + trigger[i];
+;    }
+;  }
+;}
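+;
+; For the AVX1/AVX2 runs the vectorizer is expected to fold the guarded
+; load/add/store into masked intrinsics keyed on the trigger comparison; the
+; AVX512 run widens the same pattern to <16 x float>. A minimal sketch of the
+; expected per-group shape (value names here are illustrative; the CHECK
+; lines below pin the exact output):
+;
+;   %mask = icmp slt <8 x i32> %trig.vec, <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+;   %b.vec = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %b.ptr, i32 4, <8 x i1> %mask, <8 x float> undef)
+;   %trig.fp = sitofp <8 x i32> %trig.vec to <8 x float>
+;   %sum = fadd <8 x float> %b.vec, %trig.fp
+;   call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %sum, <8 x float>* %a.ptr, i32 4, <8 x i1> %mask)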
+
+; Function Attrs: nounwind uwtable
+define void @foo2(float* %A, float* %B, i32* %trigger) {
+; AVX1-LABEL: @foo2(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
+; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
+; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
+; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1: vector.body:
+; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21
+; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21
+; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
+; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21
+; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8
+; AVX1-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
+; AVX1-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24
+; AVX1-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
+; AVX1-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
+; AVX1-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float>
+; AVX1-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float>
+; AVX1-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float>
+; AVX1-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX1-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX1-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX1-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
+; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
+; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8
+; AVX1-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
+; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
+; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
+; AVX1-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>*
+; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
+; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24
+; AVX1-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>*
+; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
+; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX1: for.body.preheader:
+; AVX1-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX1-NEXT: br label [[FOR_BODY:%.*]]
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
+; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: if.then.1:
+; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
+; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
+; AVX1-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
+;
+; AVX2-LABEL: @foo2(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
+; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
+; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
+; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
+; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21
+; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8
+; AVX2-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
+; AVX2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24
+; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24
+; AVX2-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float>
+; AVX2-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float>
+; AVX2-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float>
+; AVX2-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float>
+; AVX2-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX2-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX2-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX2-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>*
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8
+; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
+; AVX2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>*
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24
+; AVX2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>*
+; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
+; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX2: for.body.preheader:
+; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
+; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: if.then.1:
+; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
+; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
+; AVX2-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
+; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float
+; AVX2-NEXT: [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]]
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
+; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float
+; AVX2-NEXT: [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]]
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
+;
+; AVX512-LABEL: @foo2(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
+; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
+; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !21
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !21
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !21
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48
+; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21
+; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 32
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 48
+; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[TMP22:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
+; AVX512-NEXT: [[TMP23:%.*]] = sitofp <16 x i32> [[WIDE_LOAD22]] to <16 x float>
+; AVX512-NEXT: [[TMP24:%.*]] = sitofp <16 x i32> [[WIDE_LOAD23]] to <16 x float>
+; AVX512-NEXT: [[TMP25:%.*]] = sitofp <16 x i32> [[WIDE_LOAD24]] to <16 x float>
+; AVX512-NEXT: [[TMP26:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX512-NEXT: [[TMP29:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP26]], <16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 32
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP28]], <16 x float>* [[TMP35]], i32 4, <16 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 48
+; AVX512-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP29]], <16 x float>* [[TMP37]], i32 4, <16 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64
+; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
+; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
+; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
+; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float
+; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]]
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
+; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float
+; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]]
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
+;
+entry:
+ %A.addr = alloca float*, align 8
+ %B.addr = alloca float*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store float* %A, float** %A.addr, align 8
+ store float* %B, float** %B.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp slt i32 %3, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load float*, float** %B.addr, align 8
+ %arrayidx3 = getelementptr inbounds float, float* %5, i64 %idxprom2
+ %6 = load float, float* %arrayidx3, align 4
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+ %9 = load i32, i32* %arrayidx5, align 4
+ %conv = sitofp i32 %9 to float
+ %add = fadd float %6, %conv
+ %10 = load i32, i32* %i, align 4
+ %idxprom6 = sext i32 %10 to i64
+ %11 = load float*, float** %A.addr, align 8
+ %arrayidx7 = getelementptr inbounds float, float* %11, i64 %idxprom6
+ store float %add, float* %arrayidx7, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %12 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %12, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; The source code:
+;
+;void foo3(double *A, double *B, int *trigger) {
+;
+;  for (int i=0; i<10000; i++) {
+;    if (trigger[i] < 100) {
+;      A[i] = B[i] + trigger[i];
+;    }
+;  }
+;}
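+;
+; Same pattern as foo2, but with double elements: the AVX1/AVX2 runs use
+; <4 x i32>/<4 x double> groups and the AVX512 run uses <8 x i32>/<8 x double>.
+; A minimal sketch (value names here are illustrative; the CHECK lines below
+; pin the exact output):
+;
+;   %mask = icmp slt <4 x i32> %trig.vec, <i32 100, i32 100, i32 100, i32 100>
+;   %b.vec = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %b.ptr, i32 8, <4 x i1> %mask, <4 x double> undef)
+;   %trig.fp = sitofp <4 x i32> %trig.vec to <4 x double>
+;   %sum = fadd <4 x double> %b.vec, %trig.fp
+;   call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %sum, <4 x double>* %a.ptr, i32 8, <4 x i1> %mask)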
+
+; Function Attrs: nounwind uwtable
+define void @foo3(double* %A, double* %B, i32* %trigger) #0 {
+; AVX1-LABEL: @foo3(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1: vector.body:
+; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
+; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12
+; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4
+; AVX1-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
+; AVX1-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12
+; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
+; AVX1-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
+; AVX1-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
+; AVX1-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double>
+; AVX1-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX1-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX1-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX1-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4
+; AVX1-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
+; AVX1-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12
+; AVX1-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: if.then.1:
+; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
+;
+; AVX2-LABEL: @foo3(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
+; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12
+; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4
+; AVX2-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
+; AVX2-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12
+; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
+; AVX2-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
+; AVX2-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
+; AVX2-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double>
+; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX2-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX2-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX2-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4
+; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
+; AVX2-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12
+; AVX2-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: if.then.1:
+; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double
+; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]]
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double
+; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]]
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
+;
+; AVX512-LABEL: @foo3(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24
+; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 16
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 24
+; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
+; AVX512-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x double>
+; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x double>
+; AVX512-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x double>
+; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX512-NEXT: [[TMP27:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX512-NEXT: [[TMP29:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP26]], <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP27]], <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 16
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP28]], <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 24
+; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP29]], <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double
+; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]]
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double
+; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]]
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !40
+;
+entry:
+ %A.addr = alloca double*, align 8
+ %B.addr = alloca double*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store double* %A, double** %A.addr, align 8
+ store double* %B, double** %B.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp slt i32 %3, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load double*, double** %B.addr, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+ %6 = load double, double* %arrayidx3, align 8
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+ %9 = load i32, i32* %arrayidx5, align 4
+ %conv = sitofp i32 %9 to double
+ %add = fadd double %6, %conv
+ %10 = load i32, i32* %i, align 4
+ %idxprom6 = sext i32 %10 to i64
+ %11 = load double*, double** %A.addr, align 8
+ %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
+ store double %add, double* %arrayidx7, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %12 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %12, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; The source code:
+;
+;void foo4(double *A, double *B, int *trigger) {
+;
+; for (int i=0; i<10000; i += 16) {
+; if (trigger[i] < 100) {
+;     A[i] = B[i*2] + trigger[i];        << non-consecutive access
+; }
+; }
+;}
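+;
+; With the induction stepping by 16 and B indexed at i*2, none of the memory
+; accesses are consecutive, so the loop can only be vectorized with masked
+; gathers and scatters. That is presumably why only the AVX512 run below forms
+; a vector.body (using @llvm.masked.gather / @llvm.masked.scatter), while the
+; AVX1 and AVX2 runs leave the loop scalar and merely unrolled.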
+
+; Function Attrs: nounwind uwtable
+define void @foo4(double* %A, double* %B, i32* %trigger) {
+; AVX1-LABEL: @foo4(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: br label [[FOR_BODY:%.*]]
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
+; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16
+; AVX1-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
+; AVX1-NEXT: br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: for.body.1:
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: if.then.1:
+; AVX1-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]]
+; AVX1-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double
+; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]]
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 32
+; AVX1-NEXT: br label [[FOR_BODY]]
+;
+; AVX2-LABEL: @foo4(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
+; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16
+; AVX2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
+; AVX2-NEXT: br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: for.body.1:
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: if.then.1:
+; AVX2-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]]
+; AVX2-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double
+; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]]
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP6]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]]
+; AVX2-NEXT: [[TMP8:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP6]] to double
+; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP8]], [[CONV_2]]
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP9]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP10]]
+; AVX2-NEXT: [[TMP11:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP9]] to double
+; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP11]], [[CONV_3]]
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64
+; AVX2-NEXT: br label [[FOR_BODY]]
+;
+; AVX512-LABEL: @foo4(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 9985
+; AVX512-NEXT: [[SCEVGEP12:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 9985
+; AVX512-NEXT: [[SCEVGEP15:%.*]] = getelementptr double, double* [[B:%.*]], i64 19969
+; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP12]] to double*
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND017:%.*]] = icmp ugt double* [[SCEVGEP15]], [[A]]
+; AVX512-NEXT: [[BOUND118:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX512-NEXT: [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[VEC_IND_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[ENTRY]] ]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP4:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP4]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER20:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP5]], i32 8, <8 x i1> [[TMP3]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT: [[TMP6:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
+; AVX512-NEXT: [[TMP7:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20]], [[TMP6]]
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP7]], <8 x double*> [[TMP8]], i32 8, <8 x i1> [[TMP3]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP11:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP11]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER20_1:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP12]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT: [[TMP13:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_1]] to <8 x double>
+; AVX512-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_1]], [[TMP13]]
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP14]], <8 x double*> [[TMP15]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT: [[VEC_IND_NEXT_1:%.*]] = add <8 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT_1]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_2]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP18:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT_1]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP18]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER20_2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP19]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT: [[TMP20:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_2]] to <8 x double>
+; AVX512-NEXT: [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_2]], [[TMP20]]
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT_1]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP21]], <8 x double*> [[TMP22]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24
+; AVX512-NEXT: [[VEC_IND_NEXT_2]] = add <8 x i64> [[VEC_IND]], <i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384>
+; AVX512-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT_2]], 624
+; AVX512-NEXT: br i1 [[TMP23]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP24:%.*]] = sub nuw nsw i64 9999, [[INDVARS_IV_PH]]
+; AVX512-NEXT: br label [[FOR_BODY_PROL:%.*]]
+; AVX512: for.body.prol:
+; AVX512-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_INC_PROL:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ]
+; AVX512-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
+; AVX512-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_PROL]]
+; AVX512-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX_PROL]], align 4
+; AVX512-NEXT: [[CMP1_PROL:%.*]] = icmp slt i32 [[TMP25]], 100
+; AVX512-NEXT: br i1 [[CMP1_PROL]], label [[IF_THEN_PROL:%.*]], label [[FOR_INC_PROL]]
+; AVX512: if.then.prol:
+; AVX512-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[INDVARS_IV_PROL]], 1
+; AVX512-NEXT: [[ARRAYIDX3_PROL:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP26]]
+; AVX512-NEXT: [[TMP27:%.*]] = load double, double* [[ARRAYIDX3_PROL]], align 8
+; AVX512-NEXT: [[CONV_PROL:%.*]] = sitofp i32 [[TMP25]] to double
+; AVX512-NEXT: [[ADD_PROL:%.*]] = fadd double [[TMP27]], [[CONV_PROL]]
+; AVX512-NEXT: [[ARRAYIDX7_PROL:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_PROL]]
+; AVX512-NEXT: store double [[ADD_PROL]], double* [[ARRAYIDX7_PROL]], align 8
+; AVX512-NEXT: br label [[FOR_INC_PROL]]
+; AVX512: for.inc.prol:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 16
+; AVX512-NEXT: [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1
+; AVX512-NEXT: [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0
+; AVX512-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL]], !llvm.loop !50
+; AVX512: for.body.prol.loopexit:
+; AVX512-NEXT: [[DOTMASK:%.*]] = and i64 [[TMP24]], 9984
+; AVX512-NEXT: [[TMP28:%.*]] = icmp eq i64 [[DOTMASK]], 0
+; AVX512-NEXT: br i1 [[TMP28]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL_LOOPEXIT]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[TMP30:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP30]]
+; AVX512-NEXT: [[TMP31:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
+; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP31]], [[CONV]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP32:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP32]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[TMP33:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP33]]
+; AVX512-NEXT: [[TMP34:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP32]] to double
+; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP34]], [[CONV_1]]
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], 32
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP35]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[TMP36:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP36]]
+; AVX512-NEXT: [[TMP37:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP35]] to double
+; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP37]], [[CONV_2]]
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], 48
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP38]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[TMP39:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP39]]
+; AVX512-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP38]] to double
+; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP40]], [[CONV_3]]
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], 64
+; AVX512-NEXT: [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !52
+;
+entry:
+ %A.addr = alloca double*, align 8
+ %B.addr = alloca double*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store double* %A, double** %A.addr, align 8
+ store double* %B, double** %B.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp slt i32 %3, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %mul = mul nsw i32 %4, 2
+ %idxprom2 = sext i32 %mul to i64
+ %5 = load double*, double** %B.addr, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+ %6 = load double, double* %arrayidx3, align 8
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+ %9 = load i32, i32* %arrayidx5, align 4
+ %conv = sitofp i32 %9 to double
+ %add = fadd double %6, %conv
+ %10 = load i32, i32* %i, align 4
+ %idxprom6 = sext i32 %10 to i64
+ %11 = load double*, double** %A.addr, align 8
+ %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6
+ store double %add, double* %arrayidx7, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %12 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %12, 16
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+@a = common global [1 x i32*] zeroinitializer, align 8
+@c = common global i32* null, align 8
+
+; The loop here should not be vectorized due to a trapping
+; constant expression.
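+;
+; The value stored is the constant expression
+;   sdiv (i32 1, i32 zext (i1 icmp eq (... @a ..., @c) to i32))
+; whose divisor is 0 whenever the pointer comparison is false, so evaluating
+; it can trap. It is only evaluated under the trigger[i] < 100 guard, and
+; if-converting the block would speculate it unconditionally; hence, as the
+; checks below confirm, all three runs leave the loop scalar and merely
+; unrolled.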
+; Function Attrs: nounwind uwtable
+define void @foo5(i32* %A, i32* %B, i32* %trigger) {
+; AVX1-LABEL: @foo5(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: br label [[FOR_BODY:%.*]]
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: if.then.1:
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+;
+; AVX2-LABEL: @foo5(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: if.then.1:
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; AVX2-NEXT: [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100
+; AVX2-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]]
+; AVX2: if.then.4:
+; AVX2-NEXT: [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4
+; AVX2-NEXT: br label [[FOR_INC_4]]
+; AVX2: for.inc.4:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; AVX2-NEXT: [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000
+; AVX2-NEXT: br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+;
+; AVX512-LABEL: @foo5(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; AVX512-NEXT: [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100
+; AVX512-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]]
+; AVX512: if.then.4:
+; AVX512-NEXT: [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4
+; AVX512-NEXT: br label [[FOR_INC_4]]
+; AVX512: for.inc.4:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; AVX512-NEXT: [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000
+; AVX512-NEXT: br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %A.addr = alloca i32*, align 8
+ %B.addr = alloca i32*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store i32* %A, i32** %A.addr, align 8
+ store i32* %B, i32** %B.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp slt i32 %3, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load i32*, i32** %B.addr, align 8
+ %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+ %6 = load i32, i32* %arrayidx3, align 4
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4
+ %9 = load i32, i32* %arrayidx5, align 4
+ %add = add nsw i32 %6, %9
+ %10 = load i32, i32* %i, align 4
+ %idxprom6 = sext i32 %10 to i64
+ %11 = load i32*, i32** %A.addr, align 8
+ %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6
+ store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)), i32* %arrayidx7, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %12 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %12, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; Reverse loop
+;void foo6(double *in, double *out, unsigned size, int *trigger) {
+;
+; for (int i=SIZE-1; i>=0; i--) {
+; if (trigger[i] > 0) {
+; out[i] = in[i] + (double) 0.5;
+; }
+; }
+;}
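+;
+; Because the induction runs downward, the vectorized runs below address each
+; lane group at a negative offset and then reverse it with a
+; shufflevector <i32 3, i32 2, i32 1, i32 0> mask; the compare masks are
+; reversed the same way before feeding the masked loads and stores.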
+
+
+define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) {
+; AVX1-LABEL: @foo6(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
+; AVX1-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
+; AVX1-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
+; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
+; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
+; AVX1-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
+; AVX1-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
+; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
+; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1: vector.body:
+; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
+; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3
+; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41
+; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
+; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41
+; AVX1-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
+; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3
+; AVX1-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41
+; AVX1-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12
+; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3
+; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41
+; AVX1-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
+; AVX1-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer
+; AVX1-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer
+; AVX1-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer
+; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3
+; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4
+; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3
+; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
+; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3
+; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12
+; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3
+; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX1-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
+; AVX1-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
+; AVX1-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3
+; AVX1-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48
+; AVX1-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4
+; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3
+; AVX1-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48
+; AVX1-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
+; AVX1-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3
+; AVX1-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48
+; AVX1-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12
+; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3
+; AVX1-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48
+; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX1-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX1-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 4095, [[ENTRY]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
+; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: if.then.1:
+; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
+; AVX1-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nsw i64 [[INDVARS_IV]], -2
+; AVX1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0
+; AVX1-NEXT: br i1 [[CMP_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50
+;
+; AVX2-LABEL: @foo6(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
+; AVX2-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
+; AVX2-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
+; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
+; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
+; AVX2-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
+; AVX2-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
+; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
+; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
+; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3
+; AVX2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41
+; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3
+; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41
+; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
+; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3
+; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41
+; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3
+; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41
+; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer
+; AVX2-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer
+; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer
+; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
+; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3
+; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4
+; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3
+; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
+; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3
+; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12
+; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3
+; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44
+; AVX2-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
+; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3
+; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48
+; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4
+; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3
+; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48
+; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
+; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3
+; AVX2-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48
+; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12
+; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3
+; AVX2-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48
+; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX2-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX2-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
+; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: if.then.1:
+; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
+; AVX2-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01
+; AVX2-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01
+; AVX2-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4
+; AVX2-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0
+; AVX2-NEXT: br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50
+;
+; AVX512-LABEL: @foo6(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096
+; AVX512-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096
+; AVX512-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096
+; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double*
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]]
+; AVX512-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]]
+; AVX512-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]]
+; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -7
+; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53
+; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !53
+; AVX512-NEXT: [[REVERSE21:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD20]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -16
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -7
+; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53
+; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD22]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -24
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -7
+; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !53
+; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD24]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer
+; AVX512-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i32> [[REVERSE21]], zeroinitializer
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]]
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -7
+; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -7
+; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -16
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -7
+; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -24
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -7
+; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56
+; AVX512-NEXT: [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]]
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -7
+; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP30]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -7
+; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -16
+; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -7
+; AVX512-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP42]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60
+; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -24
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -7
+; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; AVX512-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01
+; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01
+; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01
+; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01
+; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4
+; AVX512-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0
+; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !62
+;
+entry:
+ %in.addr = alloca double*, align 8
+ %out.addr = alloca double*, align 8
+ %size.addr = alloca i32, align 4
+ %trigger.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store double* %in, double** %in.addr, align 8
+ store double* %out, double** %out.addr, align 8
+ store i32 %size, i32* %size.addr, align 4
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32 4095, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp sge i32 %0, 0
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load double*, double** %in.addr, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2
+ %6 = load double, double* %arrayidx3, align 8
+ %add = fadd double %6, 5.000000e-01
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load double*, double** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %8, i64 %idxprom4
+ store double %add, double* %arrayidx5, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %9 = load i32, i32* %i, align 4
+ %dec = add nsw i32 %9, -1
+ store i32 %dec, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; void foo7 (double * __restrict__ out, double ** __restrict__ in,
+; bool * __restrict__ trigger, unsigned size) {
+;
+; for (unsigned i=0; i<size; i++)
+; if (trigger[i] && (in[i] != 0))
+; out[i] = (double) 0.5;
+; }
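+;
+; The conditionally-read elements here are pointers, so the vectorizer has to
+; mask the loads of in[i] as well as the final store. A minimal sketch of the
+; shape the AVX checks below expect (value names are illustrative, not the
+; exact FileCheck captures; %splat0.5 stands for the <0.5, 0.5, 0.5, 0.5>
+; constant):
+;
+;   %mask = trunc <4 x i8> %trigger.vec to <4 x i1>
+;   %ptrs = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* %p, i32 8, <4 x i1> %mask, <4 x double*> undef)
+;   %nonnull = icmp ne <4 x double*> %ptrs, zeroinitializer
+;   %store.mask = and <4 x i1> %nonnull, %mask
+;   call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %splat0.5, <4 x double>* %q, i32 8, <4 x i1> %store.mask)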
+
+define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 {
+; AVX-LABEL: @foo7(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX: for.body.preheader:
+; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 16
+; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX: vector.ph:
+; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
+; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX: vector.body:
+; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
+; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
+; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
+; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
+; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
+; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
+; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
+; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef)
+; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4
+; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef)
+; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
+; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef)
+; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12
+; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef)
+; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
+; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
+; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
+; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
+; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
+; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
+; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
+; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
+; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
+; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
+; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51
+; AVX: middle.block:
+; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX: for.body.preheader16:
+; AVX-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT: br label [[FOR_BODY:%.*]]
+; AVX: for.body:
+; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX: land.lhs.true:
+; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
+; AVX-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
+; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX: if.then:
+; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX-NEXT: br label [[FOR_INC]]
+; AVX: for.inc:
+; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !52
+; AVX: for.end:
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @foo7(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 32
+; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX512: vector.ph:
+; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264
+; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
+; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
+; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
+; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef)
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef)
+; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
+; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
+; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
+; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
+; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63
+; AVX512: middle.block:
+; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX512: for.body.preheader16:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX512: land.lhs.true:
+; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null
+; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !64
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+;
+entry:
+ %out.addr = alloca double*, align 8
+ %in.addr = alloca double**, align 8
+ %trigger.addr = alloca i8*, align 8
+ %size.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store double* %out, double** %out.addr, align 8
+ store double** %in, double*** %in.addr, align 8
+ store i8* %trigger, i8** %trigger.addr, align 8
+ store i32 %size, i32* %size.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %1 = load i32, i32* %size.addr, align 4
+ %cmp = icmp ult i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %2 = load i32, i32* %i, align 4
+ %idxprom = zext i32 %2 to i64
+ %3 = load i8*, i8** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i8, i8* %3, i64 %idxprom
+ %4 = load i8, i8* %arrayidx, align 1
+ %tobool = trunc i8 %4 to i1
+ br i1 %tobool, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %idxprom1 = zext i32 %5 to i64
+ %6 = load double**, double*** %in.addr, align 8
+ %arrayidx2 = getelementptr inbounds double*, double** %6, i64 %idxprom1
+ %7 = load double*, double** %arrayidx2, align 8
+ %cmp3 = icmp ne double* %7, null
+ br i1 %cmp3, label %if.then, label %if.end
+
+if.then: ; preds = %land.lhs.true
+ %8 = load i32, i32* %i, align 4
+ %idxprom4 = zext i32 %8 to i64
+ %9 = load double*, double** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %9, i64 %idxprom4
+ store double 5.000000e-01, double* %arrayidx5, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %land.lhs.true, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %10 = load i32, i32* %i, align 4
+ %inc = add i32 %10, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+; typedef int (*fp)();
+; void foo8 (double* __restrict__ out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) {
+;
+; for (unsigned i=0; i<size; i++)
+; if (trigger[i] && (in[i] != 0))
+; out[i] = (double) 0.5;
+; }
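+;
+; Same structure as @foo7, but the guarded elements are function pointers, so
+; this exercises the masked-load path for the <4 x i32 ()*> / <8 x i32 ()*>
+; element types (llvm.masked.load.v4p0f_i32f and its v8 variant in the checks
+; below).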
+
+define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {
+; AVX-LABEL: @foo8(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX: for.body.preheader:
+; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 16
+; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX: vector.ph:
+; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
+; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX: vector.body:
+; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
+; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12
+; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
+; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1>
+; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1>
+; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1>
+; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1>
+; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4
+; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
+; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12
+; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]]
+; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]]
+; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]]
+; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]]
+; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]])
+; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4
+; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]])
+; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]])
+; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12
+; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]])
+; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
+; AVX: middle.block:
+; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX: for.body.preheader16:
+; AVX-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT: br label [[FOR_BODY:%.*]]
+; AVX: for.body:
+; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX: land.lhs.true:
+; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
+; AVX-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
+; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX: if.then:
+; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX-NEXT: br label [[FOR_INC]]
+; AVX: for.inc:
+; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !55
+; AVX: for.end:
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @foo8(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 32
+; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX512: vector.ph:
+; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264
+; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1>
+; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1>
+; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1>
+; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1>
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]]
+; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]]
+; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]]
+; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]]
+; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]])
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]])
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
+; AVX512: middle.block:
+; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX512: for.body.preheader16:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1
+; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0
+; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX512: land.lhs.true:
+; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null
+; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !67
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+;
+entry:
+ %out.addr = alloca double*, align 8
+ %in.addr = alloca i32 ()**, align 8
+ %trigger.addr = alloca i8*, align 8
+ %size.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store double* %out, double** %out.addr, align 8
+ store i32 ()** %in, i32 ()*** %in.addr, align 8
+ store i8* %trigger, i8** %trigger.addr, align 8
+ store i32 %size, i32* %size.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %1 = load i32, i32* %size.addr, align 4
+ %cmp = icmp ult i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %2 = load i32, i32* %i, align 4
+ %idxprom = zext i32 %2 to i64
+ %3 = load i8*, i8** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i8, i8* %3, i64 %idxprom
+ %4 = load i8, i8* %arrayidx, align 1
+ %tobool = trunc i8 %4 to i1
+ br i1 %tobool, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %idxprom1 = zext i32 %5 to i64
+ %6 = load i32 ()**, i32 ()*** %in.addr, align 8
+ %arrayidx2 = getelementptr inbounds i32 ()*, i32 ()** %6, i64 %idxprom1
+ %7 = load i32 ()*, i32 ()** %arrayidx2, align 8
+ %cmp3 = icmp ne i32 ()* %7, null
+ br i1 %cmp3, label %if.then, label %if.end
+
+if.then: ; preds = %land.lhs.true
+ %8 = load i32, i32* %i, align 4
+ %idxprom4 = zext i32 %8 to i64
+ %9 = load double*, double** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %9, i64 %idxprom4
+ store double 5.000000e-01, double* %arrayidx5, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %land.lhs.true, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %10 = load i32, i32* %i, align 4
+ %inc = add i32 %10, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/max-mstore.ll b/llvm/test/Transforms/LoopVectorize/X86/max-mstore.ll
new file mode 100644
index 00000000000..a9ac04d4560
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/max-mstore.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -S -mcpu=core-avx2 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [256 x i32] zeroinitializer, align 16
+@a = common global [256 x i32] zeroinitializer, align 16
+
+; unsigned int a[256], b[256];
+; void foo() {
+; for (int i = 0; i < 256; i++) {
+; if (b[i] > a[i])
+; a[i] = b[i];
+; }
+; }
+
+; CHECK-LABEL: @foo(
+; CHECK: load <8 x i32>
+; CHECK: icmp ugt <8 x i32>
+; CHECK: masked.store
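+;
+; A minimal sketch of the vectorized shape being checked above (illustrative
+; only, not an extra FileCheck assertion; the value names are made up): the
+; conditional store is if-converted, so both arrays are loaded
+; unconditionally, the compare supplies the mask, and the elements of @b are
+; written back through a masked store:
+;
+;   %wide.b = load <8 x i32>, <8 x i32>* %pb, align 4
+;   %wide.a = load <8 x i32>, <8 x i32>* %pa, align 4
+;   %mask = icmp ugt <8 x i32> %wide.b, %wide.a
+;   call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %wide.b, <8 x i32>* %pa, i32 4, <8 x i1> %mask)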
+
+define void @foo() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* @b, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [256 x i32], [256 x i32]* @a, i64 0, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %cmp3 = icmp ugt i32 %0, %1
+ br i1 %cmp3, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ store i32 %0, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
new file mode 100644
index 00000000000..709e69fbb1d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -0,0 +1,2481 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-threshold=150 -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DEFAULT
+; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -unroll-threshold=150 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
+
+; This file tests that the llvm.loop.vectorize.enable metadata forces
+; vectorization even at optimization levels where the loop vectorizer would
+; not normally run, and when vectorization is otherwise disabled.
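+;
+; For reference, a loop opts in through loop metadata on its backedge branch;
+; a minimal sketch (the metadata node numbers are illustrative, not the ones
+; used by the functions below):
+;
+;   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+;   !0 = distinct !{!0, !1}
+;   !1 = !{!"llvm.loop.vectorize.enable", i1 true}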
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+; O1-LABEL: @enabled(
+; O1-NEXT: entry:
+; O1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1-NEXT: ret i32 [[TMP78]]
+;
+; O2-LABEL: @enabled(
+; O2-NEXT: entry:
+; O2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT: ret i32 [[TMP78]]
+;
+; O3-LABEL: @enabled(
+; O3-NEXT: entry:
+; O3-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3-NEXT: ret i32 [[TMP78]]
+;
+; O3DEFAULT-LABEL: @enabled(
+; O3DEFAULT-NEXT: entry:
+; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3DEFAULT-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3DEFAULT-NEXT: ret i32 [[TMP78]]
+;
+; Os-LABEL: @enabled(
+; Os-NEXT: entry:
+; Os-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; Os-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; Os-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT: ret i32 [[TMP78]]
+;
+; Oz-LABEL: @enabled(
+; Oz-NEXT: entry:
+; Oz-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; Oz-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; Oz-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; Oz-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Oz-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; Oz-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; Oz-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; Oz-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; Oz-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; Oz-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; Oz-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; Oz-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; Oz-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; Oz-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; Oz-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; Oz-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; Oz-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; Oz-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; Oz-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; Oz-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; Oz-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; Oz-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; Oz-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; Oz-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; Oz-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; Oz-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; Oz-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; Oz-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; Oz-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; Oz-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; Oz-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; Oz-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; Oz-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; Oz-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; Oz-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; Oz-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; Oz-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; Oz-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; Oz-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; Oz-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; Oz-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; Oz-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; Oz-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; Oz-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; Oz-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; Oz-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; Oz-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; Oz-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; Oz-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; Oz-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; Oz-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; Oz-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; Oz-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; Oz-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; Oz-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; Oz-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; Oz-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; Oz-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; Oz-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; Oz-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; Oz-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; Oz-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; Oz-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; Oz-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT: ret i32 [[TMP78]]
+;
+; O1VEC-LABEL: @enabled(
+; O1VEC-NEXT: entry:
+; O1VEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1VEC-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1VEC-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1VEC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1VEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1VEC-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1VEC-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1VEC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1VEC-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1VEC-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1VEC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1VEC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1VEC-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1VEC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1VEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1VEC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1VEC-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1VEC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1VEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1VEC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1VEC-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1VEC-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1VEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1VEC-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1VEC-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1VEC-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1VEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1VEC-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1VEC-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1VEC-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1VEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1VEC-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1VEC-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1VEC-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1VEC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1VEC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1VEC-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1VEC-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1VEC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1VEC-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1VEC-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1VEC-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1VEC-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1VEC-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1VEC-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1VEC-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1VEC-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1VEC-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1VEC-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1VEC-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1VEC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1VEC-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1VEC-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1VEC-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1VEC-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1VEC-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1VEC-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1VEC-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1VEC-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC-NEXT: ret i32 [[TMP78]]
+;
+; OzVEC-LABEL: @enabled(
+; OzVEC-NEXT: entry:
+; OzVEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; OzVEC-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; OzVEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; OzVEC-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; OzVEC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; OzVEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; OzVEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; OzVEC-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; OzVEC-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; OzVEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; OzVEC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; OzVEC-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; OzVEC-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; OzVEC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; OzVEC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; OzVEC-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; OzVEC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; OzVEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; OzVEC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; OzVEC-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; OzVEC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; OzVEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; OzVEC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; OzVEC-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; OzVEC-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; OzVEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; OzVEC-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; OzVEC-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; OzVEC-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; OzVEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; OzVEC-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; OzVEC-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; OzVEC-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; OzVEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; OzVEC-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; OzVEC-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; OzVEC-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; OzVEC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; OzVEC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; OzVEC-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; OzVEC-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; OzVEC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; OzVEC-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; OzVEC-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; OzVEC-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; OzVEC-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; OzVEC-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; OzVEC-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; OzVEC-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; OzVEC-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; OzVEC-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; OzVEC-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; OzVEC-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; OzVEC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; OzVEC-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; OzVEC-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; OzVEC-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; OzVEC-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; OzVEC-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; OzVEC-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; OzVEC-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; OzVEC-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC-NEXT: ret i32 [[TMP78]]
+;
+; O1VEC2-LABEL: @enabled(
+; O1VEC2-NEXT: entry:
+; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1VEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1VEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1VEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1VEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1VEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1VEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1VEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1VEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1VEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1VEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1VEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1VEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1VEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1VEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1VEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1VEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1VEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1VEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1VEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1VEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1VEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1VEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1VEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1VEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1VEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1VEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1VEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1VEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1VEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1VEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1VEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1VEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1VEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1VEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1VEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1VEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1VEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1VEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1VEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1VEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1VEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1VEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1VEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1VEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1VEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1VEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1VEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1VEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1VEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1VEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1VEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1VEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1VEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1VEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1VEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1VEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1VEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1VEC2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT: ret i32 [[TMP78]]
+;
+; OzVEC2-LABEL: @enabled(
+; OzVEC2-NEXT: entry:
+; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; OzVEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; OzVEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; OzVEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; OzVEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; OzVEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; OzVEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; OzVEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; OzVEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; OzVEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; OzVEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; OzVEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; OzVEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; OzVEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; OzVEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; OzVEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; OzVEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; OzVEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; OzVEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; OzVEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; OzVEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; OzVEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; OzVEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; OzVEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; OzVEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; OzVEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; OzVEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; OzVEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; OzVEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; OzVEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; OzVEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; OzVEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; OzVEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; OzVEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; OzVEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; OzVEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; OzVEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; OzVEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; OzVEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; OzVEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; OzVEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; OzVEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; OzVEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; OzVEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; OzVEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; OzVEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; OzVEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; OzVEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; OzVEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; OzVEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; OzVEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; OzVEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; OzVEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; OzVEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; OzVEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; OzVEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; OzVEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; OzVEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; OzVEC2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC2-NEXT: ret i32 [[TMP78]]
+;
+; O3DIS-LABEL: @enabled(
+; O3DIS-NEXT: entry:
+; O3DIS-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DIS-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DIS-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DIS-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3DIS-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DIS-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3DIS-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DIS-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3DIS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DIS-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3DIS-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DIS-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3DIS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DIS-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DIS-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DIS-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3DIS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DIS-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3DIS-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DIS-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3DIS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DIS-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3DIS-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DIS-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3DIS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DIS-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3DIS-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DIS-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3DIS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DIS-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DIS-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DIS-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3DIS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DIS-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3DIS-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DIS-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3DIS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DIS-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3DIS-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DIS-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3DIS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DIS-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3DIS-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DIS-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3DIS-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DIS-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3DIS-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DIS-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3DIS-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3DIS-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3DIS-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3DIS-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3DIS-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3DIS-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3DIS-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3DIS-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3DIS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3DIS-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3DIS-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3DIS-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3DIS-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3DIS-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3DIS-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3DIS-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3DIS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3DIS-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3DIS-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3DIS-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3DIS-NEXT: ret i32 [[TMP78]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %N
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 64
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end: ; preds = %for.body
+ %1 = load i32, i32* %a, align 4
+ ret i32 %1
+}
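+
+; The IR above corresponds to a source loop of roughly this shape (a sketch
+; reconstructed from the body; the exact pragma spelling is an assumption
+; inferred from the !llvm.loop metadata on the backedge):
+;
+;   // #pragma clang loop vectorize(enable)
+;   for (int i = 0; i < 64; ++i)
+;     a[i] = b[i] + N;
+;   return a[0];
+;
+; Every prefix checked above sees the same output for @enabled: the loop is
+; vectorized at VF 4 and the 16 resulting vector iterations are fully
+; unrolled (offsets 0 through 60, no scalar remainder), so only the check
+; prefix differs.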
+
+define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+; O1-LABEL: @nopragma(
+; O1-NEXT: entry:
+; O1-NEXT: br label [[FOR_BODY:%.*]]
+; O1: for.body:
+; O1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; O1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; O1: for.end:
+; O1-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1-NEXT: ret i32 [[TMP1]]
+;
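+; Without the pragma, -O1 leaves the loop scalar (checked above), while -O2
+; and -O3 (checked below) vectorize and fully unroll it the same way as
+; @enabled.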
+; O2-LABEL: @nopragma(
+; O2-NEXT: entry:
+; O2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT: ret i32 [[TMP78]]
+;
+; O3-LABEL: @nopragma(
+; O3-NEXT: entry:
+; O3-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3-NEXT: ret i32 [[TMP78]]
+;
+; O3DEFAULT-LABEL: @nopragma(
+; O3DEFAULT-NEXT: entry:
+; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3DEFAULT-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3DEFAULT-NEXT: ret i32 [[TMP78]]
+;
+; Os-LABEL: @nopragma(
+; Os-NEXT: entry:
+; Os-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; Os-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; Os-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT: ret i32 [[TMP78]]
+;
+; Oz-LABEL: @nopragma(
+; Oz-NEXT: entry:
+; Oz-NEXT: br label [[FOR_BODY:%.*]]
+; Oz: for.body:
+; Oz-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; Oz-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Oz-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; Oz-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Oz-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; Oz-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; Oz-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; Oz: for.end:
+; Oz-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT: ret i32 [[TMP1]]
+;
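+; Note the contrast: plain Oz (above) leaves the loop scalar, whereas the
+; O1VEC and OzVEC configurations that follow (presumably O1 and Oz with the
+; loop vectorizer explicitly enabled) emit the same fully-unrolled <4 x i32>
+; sequence as O2 and O3.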
+; O1VEC-LABEL: @nopragma(
+; O1VEC-NEXT: entry:
+; O1VEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1VEC-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1VEC-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1VEC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1VEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1VEC-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1VEC-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1VEC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1VEC-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1VEC-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1VEC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1VEC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1VEC-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1VEC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1VEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1VEC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1VEC-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1VEC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1VEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1VEC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1VEC-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1VEC-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1VEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1VEC-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1VEC-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1VEC-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1VEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1VEC-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1VEC-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1VEC-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1VEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1VEC-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1VEC-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1VEC-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1VEC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1VEC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1VEC-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1VEC-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1VEC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1VEC-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1VEC-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1VEC-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1VEC-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1VEC-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1VEC-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1VEC-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1VEC-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1VEC-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1VEC-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1VEC-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1VEC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1VEC-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1VEC-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1VEC-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1VEC-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1VEC-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1VEC-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1VEC-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1VEC-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC-NEXT: ret i32 [[TMP78]]
+;
+; OzVEC-LABEL: @nopragma(
+; OzVEC-NEXT: entry:
+; OzVEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; OzVEC-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; OzVEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; OzVEC-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; OzVEC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; OzVEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; OzVEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; OzVEC-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; OzVEC-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; OzVEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; OzVEC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; OzVEC-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; OzVEC-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; OzVEC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; OzVEC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; OzVEC-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; OzVEC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; OzVEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; OzVEC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; OzVEC-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; OzVEC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; OzVEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; OzVEC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; OzVEC-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; OzVEC-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; OzVEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; OzVEC-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; OzVEC-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; OzVEC-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; OzVEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; OzVEC-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; OzVEC-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; OzVEC-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; OzVEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; OzVEC-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; OzVEC-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; OzVEC-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; OzVEC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; OzVEC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; OzVEC-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; OzVEC-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; OzVEC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; OzVEC-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; OzVEC-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; OzVEC-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; OzVEC-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; OzVEC-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; OzVEC-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; OzVEC-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; OzVEC-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; OzVEC-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; OzVEC-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; OzVEC-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; OzVEC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; OzVEC-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; OzVEC-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; OzVEC-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; OzVEC-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; OzVEC-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; OzVEC-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; OzVEC-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; OzVEC-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC-NEXT: ret i32 [[TMP78]]
+;
+; O1VEC2-LABEL: @nopragma(
+; O1VEC2-NEXT: entry:
+; O1VEC2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; O1VEC2: vector.ph:
+; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC2-NEXT: br label [[VECTOR_BODY:%.*]]
+; O1VEC2: vector.body:
+; O1VEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; O1VEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; O1VEC2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; O1VEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; O1VEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; O1VEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; O1VEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
+; O1VEC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; O1VEC2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; O1VEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; O1VEC2: middle.block:
+; O1VEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64
+; O1VEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; O1VEC2: scalar.ph:
+; O1VEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; O1VEC2-NEXT: br label [[FOR_BODY:%.*]]
+; O1VEC2: for.body:
+; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]]
+; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; O1VEC2: for.end:
+; O1VEC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT: ret i32 [[TMP10]]
+;
+; OzVEC2-LABEL: @nopragma(
+; OzVEC2-NEXT: entry:
+; OzVEC2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; OzVEC2: vector.ph:
+; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC2-NEXT: br label [[VECTOR_BODY:%.*]]
+; OzVEC2: vector.body:
+; OzVEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; OzVEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; OzVEC2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; OzVEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; OzVEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; OzVEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; OzVEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
+; OzVEC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; OzVEC2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; OzVEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; OzVEC2: middle.block:
+; OzVEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64
+; OzVEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; OzVEC2: scalar.ph:
+; OzVEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; OzVEC2-NEXT: br label [[FOR_BODY:%.*]]
+; OzVEC2: for.body:
+; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]]
+; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; OzVEC2: for.end:
+; OzVEC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC2-NEXT: ret i32 [[TMP10]]
+;
+; O3DIS-LABEL: @nopragma(
+; O3DIS-NEXT: entry:
+; O3DIS-NEXT: br label [[FOR_BODY:%.*]]
+; O3DIS: for.body:
+; O3DIS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O3DIS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3DIS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O3DIS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3DIS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O3DIS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64
+; O3DIS-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; O3DIS: for.end:
+; O3DIS-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3DIS-NEXT: ret i32 [[TMP1]]
+;
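+; Summary of the @nopragma expectations above (64-iteration loop, no pragma):
+; O2, O3, O3DEFAULT, Os, O1VEC and OzVEC vectorize at VF=4 and fully unroll,
+; leaving 16 straight-line <4 x i32> load/add/store groups; O1VEC2 and OzVEC2
+; keep the vector.body loop with a scalar.ph remainder instead of unrolling;
+; Oz and O3DIS leave the scalar loop untouched.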
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %N
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 64
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %1 = load i32, i32* %a, align 4
+ ret i32 %1
+}
+
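+; The @disabled function below runs the same b[i] + N loop for 48 iterations.
+; In the configurations shown, O1, O2, O3, Os and Oz keep the loop scalar and
+; attach !llvm.loop metadata (presumably carrying
+; llvm.loop.vectorize.enable = false), while O3DEFAULT still emits <4 x i32>
+; operations, presumably via the SLP vectorizer, which the loop pragma does
+; not affect. A plausible C original, reconstructed from the IR rather than
+; copied from the test source:
+;
+;   int disabled(int * restrict a, int * restrict b, int N) {
+;   #pragma clang loop vectorize(disable)
+;     for (int i = 0; i < 48; i++)
+;       a[i] = b[i] + N;
+;     return a[0];
+;   }
+;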
+define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+; O1-LABEL: @disabled(
+; O1-NEXT: entry:
+; O1-NEXT: br label [[FOR_BODY:%.*]]
+; O1: for.body:
+; O1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O1: for.end:
+; O1-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1-NEXT: ret i32 [[TMP1]]
+;
+; O2-LABEL: @disabled(
+; O2-NEXT: entry:
+; O2-NEXT: br label [[FOR_BODY:%.*]]
+; O2: for.body:
+; O2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O2: for.end:
+; O2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT: ret i32 [[TMP1]]
+;
+; O3-LABEL: @disabled(
+; O3-NEXT: entry:
+; O3-NEXT: br label [[FOR_BODY:%.*]]
+; O3: for.body:
+; O3-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O3-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O3-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O3-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O3-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O3: for.end:
+; O3-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3-NEXT: ret i32 [[TMP1]]
+;
+; O3DEFAULT-LABEL: @disabled(
+; O3DEFAULT-NEXT: entry:
+; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DEFAULT-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DEFAULT-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP1]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DEFAULT-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX_4]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; O3DEFAULT-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX2_4]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DEFAULT-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DEFAULT-NEXT: [[TMP10:%.*]] = bitcast i32* [[ARRAYIDX_8]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; O3DEFAULT-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX2_8]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DEFAULT-NEXT: [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX_12]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DEFAULT-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX2_12]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DEFAULT-NEXT: [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DEFAULT-NEXT: [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX_16]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
+; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP19]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX2_16]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DEFAULT-NEXT: [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX_20]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
+; O3DEFAULT-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX2_20]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DEFAULT-NEXT: [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DEFAULT-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX_24]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP27:%.*]] = load <4 x i32>, <4 x i32>* [[TMP26]], align 4
+; O3DEFAULT-NEXT: [[TMP28:%.*]] = add nsw <4 x i32> [[TMP27]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[ARRAYIDX2_24]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DEFAULT-NEXT: [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DEFAULT-NEXT: [[TMP30:%.*]] = bitcast i32* [[ARRAYIDX_28]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP31:%.*]] = load <4 x i32>, <4 x i32>* [[TMP30]], align 4
+; O3DEFAULT-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP33:%.*]] = bitcast i32* [[ARRAYIDX2_28]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP32]], <4 x i32>* [[TMP33]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DEFAULT-NEXT: [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[ARRAYIDX_32]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP35:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DEFAULT-NEXT: [[TMP36:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX2_32]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DEFAULT-NEXT: [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DEFAULT-NEXT: [[TMP38:%.*]] = bitcast i32* [[ARRAYIDX_36]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP39:%.*]] = load <4 x i32>, <4 x i32>* [[TMP38]], align 4
+; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP41:%.*]] = bitcast i32* [[ARRAYIDX2_36]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP41]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DEFAULT-NEXT: [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[ARRAYIDX_40]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP43:%.*]] = load <4 x i32>, <4 x i32>* [[TMP42]], align 4
+; O3DEFAULT-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[TMP43]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP45:%.*]] = bitcast i32* [[ARRAYIDX2_40]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* [[TMP45]], align 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DEFAULT-NEXT: [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DEFAULT-NEXT: [[TMP46:%.*]] = bitcast i32* [[ARRAYIDX_44]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[TMP47:%.*]] = load <4 x i32>, <4 x i32>* [[TMP46]], align 4
+; O3DEFAULT-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP3]]
+; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[ARRAYIDX2_44]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP48]], <4 x i32>* [[TMP49]], align 4
+; O3DEFAULT-NEXT: [[TMP50:%.*]] = load i32, i32* [[A]], align 4
+; O3DEFAULT-NEXT: ret i32 [[TMP50]]
+;
+; Os-LABEL: @disabled(
+; Os-NEXT: entry:
+; Os-NEXT: br label [[FOR_BODY:%.*]]
+; Os: for.body:
+; Os-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; Os-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; Os-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Os-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; Os-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; Os-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Os-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; Os-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; Os-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; Os: for.end:
+; Os-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT: ret i32 [[TMP1]]
+;
+; Oz-LABEL: @disabled(
+; Oz-NEXT: entry:
+; Oz-NEXT: br label [[FOR_BODY:%.*]]
+; Oz: for.body:
+; Oz-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; Oz-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; Oz-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; Oz-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; Oz-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; Oz-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; Oz-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; Oz-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; Oz: for.end:
+; Oz-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT: ret i32 [[TMP1]]
+;
+; O1VEC-LABEL: @disabled(
+; O1VEC-NEXT: entry:
+; O1VEC-NEXT: br label [[FOR_BODY:%.*]]
+; O1VEC: for.body:
+; O1VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O1VEC-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1VEC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O1VEC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O1VEC-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1VEC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O1VEC-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O1VEC: for.end:
+; O1VEC-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC-NEXT: ret i32 [[TMP1]]
+;
+; OzVEC-LABEL: @disabled(
+; OzVEC-NEXT: entry:
+; OzVEC-NEXT: br label [[FOR_BODY:%.*]]
+; OzVEC: for.body:
+; OzVEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; OzVEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; OzVEC-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; OzVEC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; OzVEC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; OzVEC-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; OzVEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; OzVEC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; OzVEC-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; OzVEC: for.end:
+; OzVEC-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC-NEXT: ret i32 [[TMP1]]
+;
+; O1VEC2-LABEL: @disabled(
+; O1VEC2-NEXT: entry:
+; O1VEC2-NEXT: br label [[FOR_BODY:%.*]]
+; O1VEC2: for.body:
+; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O1VEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !4
+; O1VEC2: for.end:
+; O1VEC2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT: ret i32 [[TMP1]]
+;
+; OzVEC2-LABEL: @disabled(
+; OzVEC2-NEXT: entry:
+; OzVEC2-NEXT: br label [[FOR_BODY:%.*]]
+; OzVEC2: for.body:
+; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; OzVEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !4
+; OzVEC2: for.end:
+; OzVEC2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC2-NEXT: ret i32 [[TMP1]]
+;
+; O3DIS-LABEL: @disabled(
+; O3DIS-NEXT: entry:
+; O3DIS-NEXT: br label [[FOR_BODY:%.*]]
+; O3DIS: for.body:
+; O3DIS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; O3DIS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; O3DIS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]]
+; O3DIS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; O3DIS-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
+; O3DIS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; O3DIS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48
+; O3DIS-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0
+; O3DIS: for.end:
+; O3DIS-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4
+; O3DIS-NEXT: ret i32 [[TMP1]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %N
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 48
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
+
+for.end: ; preds = %for.body
+ %1 = load i32, i32* %a, align 4
+ ret i32 %1
+}
+
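+; Loop metadata: !0/!1 request vectorization via llvm.loop.vectorize.enable;
+; !2/!3, referenced by the branch in @disabled above, disable it.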
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 1}
+!2 = !{!2, !3}
+!3 = !{!"llvm.loop.vectorize.enable", i1 0}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll
new file mode 100644
index 00000000000..bb972c4488a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -vectorizer-min-trip-count=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
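+; The loop has a trip count of only 8, but with -vectorizer-min-trip-count=1
+; it should still be vectorized into <4 x float> operations.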
+; CHECK: <4 x float>
+define void @trivial_loop(float* nocapture %a) nounwind uwtable optsize {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %add = fadd float %0, 1.000000e+00
+ store float %add, float* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 8
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
new file mode 100644
index 00000000000..ad79e38cafa
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll
@@ -0,0 +1,145 @@
+; REQUIRES: asserts
+; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
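+; Check the SLM (Silvermont) cost model for i32 multiplies whose sources are
+; extended from i8/i16: when both sources fit into 16 bits, a cheap
+; pmullw-based sequence suffices; otherwise the wider pmulhw/pmullw or pmulld
+; sequences are needed, and the reported cost rises accordingly.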
+define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) {
+entry:
+ %cmp12 = icmp eq i32 %N, 0
+ br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %phitmp = trunc i32 %add4 to i8
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+ ret i8 %acc.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = sext i8 %1 to i32
+; The sources of the mul are sext/sext from i8;
+; use the pmullw/sext sequence.
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul = mul nsw i32 %conv3, %conv
+; The sources of the mul are zext/sext from i8;
+; use the pmulhw/pmullw/pshuf sequence.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %conv4 = zext i8 %1 to i32
+ %mul2 = mul nsw i32 %conv4, %conv
+ %sum0 = add i32 %mul, %mul2
+; The sources of the mul are zext/zext from i8;
+; use the pmullw/zext sequence.
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %conv5 = zext i8 %0 to i32
+ %mul3 = mul nsw i32 %conv5, %conv4
+ %sum1 = add i32 %sum0, %mul3
+; The sources of the mul are sext and the constant -120;
+; use the pmullw/sext sequence.
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul4 = mul nsw i32 -120, %conv3
+ %sum2 = add i32 %sum1, %mul4
+; The sources of the mul are sext and the constant 250;
+; use the pmulhw/pmullw/pshuf sequence.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul5 = mul nsw i32 250, %conv3
+ %sum3 = add i32 %sum2, %mul5
+; The sources of the mul are zext and the constant -120;
+; use the pmulhw/pmullw/pshuf sequence.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul6 = mul nsw i32 -120, %conv4
+ %sum4 = add i32 %sum3, %mul6
+; The sources of the mul are zext and the constant 250;
+; use the pmullw/zext sequence.
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
+ %mul7 = mul nsw i32 250, %conv4
+ %sum5 = add i32 %sum4, %mul7
+ %add = add i32 %acc.013, 5
+ %add4 = add i32 %add, %sum5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) {
+entry:
+ %cmp12 = icmp eq i32 %N, 0
+ br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %N to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %phitmp = trunc i32 %add4 to i16
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
+ ret i16 %acc.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv
+ %0 = load i16, i16* %arrayidx, align 1
+ %conv = sext i16 %0 to i32
+ %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv
+ %1 = load i16, i16* %arrayidx2, align 1
+ %conv3 = sext i16 %1 to i32
+; The sources of the mul are sext/sext from i16;
+; use the pmulhw/pmullw/pshuf sequence.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul = mul nsw i32 %conv3, %conv
+; The sources of the mul are zext/sext from i16;
+; use pmulld.
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %conv4 = zext i16 %1 to i32
+ %mul2 = mul nsw i32 %conv4, %conv
+ %sum0 = add i32 %mul, %mul2
+; The sources of the mul are zext/zext from i16;
+; use the pmulhw/pmullw/zext sequence.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %conv5 = zext i16 %0 to i32
+ %mul3 = mul nsw i32 %conv5, %conv4
+ %sum1 = add i32 %sum0, %mul3
+; The sources of the mul are sext and the constant -32000;
+; use the pmulhw/pmullw/sext sequence.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+ %mul4 = mul nsw i32 -32000, %conv3
+ %sum2 = add i32 %sum1, %mul4
+; The sources of the mul are sext and the constant 64000;
+; use pmulld.
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %mul5 = mul nsw i32 64000, %conv3
+ %sum3 = add i32 %sum2, %mul5
+; The sources of the mul are zext and the constant -32000;
+; use pmulld.
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
+ %mul6 = mul nsw i32 -32000, %conv4
+ %sum4 = add i32 %sum3, %mul6
+; The sources of the mul are zext and the constant 64000;
+; use the pmulhw/pmullw/zext sequence.
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
+  %mul7 = mul nsw i32 64000, %conv4
+ %sum5 = add i32 %sum4, %mul7
+ %add = add i32 %acc.013, 5
+ %add4 = add i32 %add, %sum5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no-vector.ll b/llvm/test/Transforms/LoopVectorize/X86/no-vector.ll
new file mode 100644
index 00000000000..4b464b01267
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/no-vector.ll
@@ -0,0 +1,22 @@
+; RUN: opt -S -mtriple=i386-unknown-freebsd -mcpu=i486 -loop-vectorize < %s
+
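+; There is no FileCheck here: i486 has no vector units, so this is a
+; compile-only test that the vectorizer handles PR14639 without crashing.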
+define i32 @PR14639(i8* nocapture %s, i32 %len) nounwind {
+entry:
+ %cmp4 = icmp sgt i32 %len, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %r.05 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %s, i32 %i.06
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = sext i8 %0 to i32
+ %xor = xor i32 %conv, %r.05
+ %inc = add nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, %len
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %r.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
+ ret i32 %r.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
new file mode 100644
index 00000000000..177c120b3df
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll
@@ -0,0 +1,109 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+
+; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
+; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2)
+
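+; @cond_sum is not vectorized because reordering its double fadd reduction is
+; not provably safe without fast-math; @cond_sum_loop_hint is vectorized
+; anyway because its loop carries llvm.loop.vectorize.enable (!27).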
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind readonly ssp uwtable
+define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 {
+entry:
+ %cmp.7 = icmp sgt i32 %n, 0, !dbg !3
+ br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !9
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %add.lcssa = phi double [ %add, %for.body ]
+ br label %for.cond.cleanup, !dbg !10
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret double %a.0.lcssa, !dbg !10
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !9
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !9, !tbaa !11
+ %cmp1 = icmp eq i32 %0, 0, !dbg !15
+ %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !9
+ %add = fadd double %a.08, %cond, !dbg !16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !8
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !8
+ %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !8
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !8, !llvm.loop !17
+}
+
+; Function Attrs: nounwind readonly ssp uwtable
+define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 {
+entry:
+ %cmp.7 = icmp sgt i32 %n, 0, !dbg !19
+ br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !22
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %add.lcssa = phi double [ %add, %for.body ]
+ br label %for.cond.cleanup, !dbg !23
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret double %a.0.lcssa, !dbg !23
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !22
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !11
+ %cmp1 = icmp eq i32 %0, 0, !dbg !24
+ %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !22
+ %add = fadd double %a.08, %cond, !dbg !25
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21
+ %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !21
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !21, !llvm.loop !26
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!28}
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 1, !"PIC Level", i32 2}
+!2 = !{!"clang version 3.7.0"}
+!3 = !DILocation(line: 5, column: 20, scope: !4)
+!4 = distinct !DISubprogram(name: "cond_sum", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7)
+!5 = !DIFile(filename: "no_fpmath.c", directory: "")
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 5, column: 3, scope: !4)
+!9 = !DILocation(line: 6, column: 14, scope: !4)
+!10 = !DILocation(line: 9, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 6, column: 19, scope: !4)
+!16 = !DILocation(line: 6, column: 11, scope: !4)
+!17 = distinct !{!17, !18}
+!18 = !{!"llvm.loop.unroll.disable"}
+!19 = !DILocation(line: 16, column: 20, scope: !20)
+!20 = distinct !DISubprogram(name: "cond_sum_loop_hint", scope: !5, file: !5, line: 12, type: !6, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7)
+!21 = !DILocation(line: 16, column: 3, scope: !20)
+!22 = !DILocation(line: 17, column: 14, scope: !20)
+!23 = !DILocation(line: 20, column: 3, scope: !20)
+!24 = !DILocation(line: 17, column: 19, scope: !20)
+!25 = !DILocation(line: 17, column: 11, scope: !20)
+!26 = distinct !{!26, !27, !18}
+!27 = !{!"llvm.loop.vectorize.enable", i1 true}
+!28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
+ file: !5,
+ isOptimized: true, flags: "-O2",
+ splitDebugFilename: "abc.debug", emissionKind: 2)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
new file mode 100644
index 00000000000..fbc4ab2aa86
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll
@@ -0,0 +1,113 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s
+
+; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300)
+; CHECK: remark: no_fpmath.c:6:14: loop not vectorized
+; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300)
+
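+; Same as no_fpmath.ll, but the functions carry profile data (!29-!31), so
+; -pass-remarks-with-hotness annotates each remark with a hotness value.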
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind readonly ssp uwtable
+define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 !prof !29 {
+entry:
+ %cmp.7 = icmp sgt i32 %n, 0, !dbg !3
+ br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8, !prof !30
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !9
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %add.lcssa = phi double [ %add, %for.body ]
+ br label %for.cond.cleanup, !dbg !10
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret double %a.0.lcssa, !dbg !10
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !9
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !9, !tbaa !11
+ %cmp1 = icmp eq i32 %0, 0, !dbg !15
+ %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !9
+ %add = fadd double %a.08, %cond, !dbg !16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !8
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !8
+ %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !8
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !8, !llvm.loop !17, !prof !31
+}
+
+; Function Attrs: nounwind readonly ssp uwtable
+define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 !prof !29 {
+entry:
+ %cmp.7 = icmp sgt i32 %n, 0, !dbg !19
+ br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21, !prof !30
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !22
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %add.lcssa = phi double [ %add, %for.body ]
+ br label %for.cond.cleanup, !dbg !23
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+ ret double %a.0.lcssa, !dbg !23
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !22
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !11
+ %cmp1 = icmp eq i32 %0, 0, !dbg !24
+ %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !22
+ %add = fadd double %a.08, %cond, !dbg !25
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21
+ %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !21
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !21, !llvm.loop !26, !prof !31
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!28}
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = !{i32 1, !"PIC Level", i32 2}
+!2 = !{!"clang version 3.7.0"}
+!3 = !DILocation(line: 5, column: 20, scope: !4)
+!4 = distinct !DISubprogram(name: "cond_sum", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7)
+!5 = !DIFile(filename: "no_fpmath.c", directory: "")
+!6 = !DISubroutineType(types: !7)
+!7 = !{}
+!8 = !DILocation(line: 5, column: 3, scope: !4)
+!9 = !DILocation(line: 6, column: 14, scope: !4)
+!10 = !DILocation(line: 9, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 6, column: 19, scope: !4)
+!16 = !DILocation(line: 6, column: 11, scope: !4)
+!17 = distinct !{!17, !18}
+!18 = !{!"llvm.loop.unroll.disable"}
+!19 = !DILocation(line: 16, column: 20, scope: !20)
+!20 = distinct !DISubprogram(name: "cond_sum_loop_hint", scope: !5, file: !5, line: 12, type: !6, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7)
+!21 = !DILocation(line: 16, column: 3, scope: !20)
+!22 = !DILocation(line: 17, column: 14, scope: !20)
+!23 = !DILocation(line: 20, column: 3, scope: !20)
+!24 = !DILocation(line: 17, column: 19, scope: !20)
+!25 = !DILocation(line: 17, column: 11, scope: !20)
+!26 = distinct !{!26, !27, !18}
+!27 = !{!"llvm.loop.vectorize.enable", i1 true}
+!28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
+ file: !5,
+ isOptimized: true, flags: "-O2",
+ splitDebugFilename: "abc.debug", emissionKind: 2)
+!29 = !{!"function_entry_count", i64 3}
+!30 = !{!"branch_weights", i32 99, i32 1}
+!31 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
new file mode 100644
index 00000000000..9fa65534f32
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -0,0 +1,198 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; This test verifies that, for loops carrying the optimize-for-size or
+; minimize-size attributes, the loop vectorizer avoids a scalar tail loop by
+; folding the tail into the masked vector body. This is a target-dependent
+; version of the test.
+; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s
+; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_optsize() #0 {
+; CHECK-LABEL: @foo_optsize(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 0
+;
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 202
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+attributes #0 = { optsize }
+
+define i32 @foo_minsize() #1 {
+; CHECK-LABEL: @foo_minsize(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
+; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
+; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 0
+;
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 202
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+attributes #1 = { minsize }
+
+
+; We can't vectorize this one: it would require versioning on stride==1, which
+; opt-for-size does not allow, even though the trip count is a multiple of VF.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+; AUTOVF-LABEL: @scev4stride1
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.body:
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
+for.body.preheader:
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %mul = mul nsw i32 %i.07, %k
+ %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+ store i32 %0, i32* %arrayidx1, align 4
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, 256
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ ret void
+}
+
+attributes #2 = { optsize }
+
+
+; PR39497
+; We can't vectorize this one: an overflow check would require versioning, and
+; the tiny trip count puts us in opt-for-size mode (which could otherwise have
+; folded the tail by masking).
+; CHECK-LABEL: @main
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.cond:
+; AUTOVF-LABEL: @main
+; AUTOVF-NOT: vector.scevcheck
+; AUTOVF-NOT: vector.body:
+; AUTOVF-LABEL: for.cond:
+define i32 @main() local_unnamed_addr {
+while.cond:
+ br label %for.cond
+
+for.cond:
+ %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ]
+ %conv = and i32 %d.0, 65535
+ %cmp = icmp ult i32 %conv, 4
+ %add = add nuw nsw i32 %conv, 1
+ br i1 %cmp, label %for.cond, label %while.cond.loopexit
+
+while.cond.loopexit:
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
new file mode 100644
index 00000000000..3f22b56290e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -0,0 +1,114 @@
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
+
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+; int i1, i2;
+;
+; #pragma clang loop vectorize(enable)
+; for (i1 = 0; i1 < 8; i1++) {
+; arr2[i1] = i1;
+; for (i2 = 0; i2 < 8; i2++)
+; arr[i2][i1] = i1 + n;
+; }
+; }
+;
+
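+; No explicit vectorization width is requested, so the VPlan native path picks
+; the widest profitable VF for the target: 4 by default, 8 with AVX/AVX2.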
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+; AVX-LABEL: vector.ph:
+; AVX: %[[SplatVal:.*]] = insertelement <8 x i32> undef, i32 %n, i32 0
+; AVX: %[[Splat:.*]] = shufflevector <8 x i32> %[[SplatVal]], <8 x i32> undef, <8 x i32> zeroinitializer
+
+; AVX-LABEL: vector.body:
+; AVX: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; AVX: %[[VecInd:.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <8 x i64> %[[VecInd]]
+; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
+; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[VecIndTr]], <8 x i32*> %[[AAddr]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; AVX: %[[VecIndTr2:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32>
+; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr2]], %[[Splat]]
+; AVX: br label %[[InnerLoop:.+]]
+
+; AVX: [[InnerLoop]]:
+; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x i64> %[[VecInd]]
+; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[StoreVal]], <8 x i32*> %[[AAddr2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true
+; AVX: %[[InnerPhiNext]] = add nuw nsw <8 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX: %[[VecCond:.*]] = icmp eq <8 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; AVX: %[[InnerCond:.*]] = extractelement <8 x i1> %[[VecCond]], i32 0
+; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; AVX: [[ForInc]]:
+; AVX: %[[IndNext]] = add i64 %[[Ind]], 8
+; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; AVX: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; AVX: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.inc8, %entry
+ %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+ %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+ %0 = trunc i64 %indvars.iv21 to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %1 = trunc i64 %indvars.iv21 to i32
+ %add = add nsw i32 %1, %n
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+ store i32 %add, i32* %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8: ; preds = %for.body3
+ %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+ %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+ br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
new file mode 100644
index 00000000000..0e6f06497e3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll
@@ -0,0 +1,50 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The parallel loop has been invalidated by the new memory accesses introduced
+; by reg2mem (Loop::isParallel() starts to return false). Ensure the loop is
+; no longer vectorized.
+
+;CHECK-NOT: <4 x i32>
+define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+ %indvars.iv.next.reg2mem = alloca i64
+ %indvars.iv.reg2mem = alloca i64
+ %"reg2mem alloca point" = bitcast i32 0 to i32
+ store i64 0, i64* %indvars.iv.reg2mem
+ br label %for.body
+
+for.body: ; preds = %for.body.for.body_crit_edge, %entry
+ %indvars.iv.reload = load i64, i64* %indvars.iv.reg2mem
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.reload
+ %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !4
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.reload
+ %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !4
+ %idxprom3 = sext i32 %1 to i64
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
+ store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !4
+ %indvars.iv.next = add i64 %indvars.iv.reload, 1
+ ; A new store without the parallel metadata here:
+ store i64 %indvars.iv.next, i64* %indvars.iv.next.reg2mem
+ %indvars.iv.next.reload1 = load i64, i64* %indvars.iv.next.reg2mem
+ %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next.reload1
+ %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !4
+ store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !4
+ %indvars.iv.next.reload = load i64, i64* %indvars.iv.next.reg2mem
+ %lftr.wideiv = trunc i64 %indvars.iv.next.reload to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 512
+ br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop !3
+
+for.body.for.body_crit_edge: ; preds = %for.body
+ %indvars.iv.next.reload2 = load i64, i64* %indvars.iv.next.reg2mem
+ store i64 %indvars.iv.next.reload2, i64* %indvars.iv.reg2mem
+ br label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !4}}
+!4 = distinct !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
new file mode 100644
index 00000000000..be778850096
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll
@@ -0,0 +1,115 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; A tricky loop:
+;
+; void loop(int *a, int *b) {
+; for (int i = 0; i < 512; ++i) {
+; a[a[i]] = b[i];
+; a[i] = b[i+1];
+; }
+; }
+
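+; The store through a[a[i]] cannot be disambiguated, so this loop may only be
+; vectorized when parallel loop metadata guarantees the accesses are
+; independent (see @parallel_loop below).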
+;CHECK-LABEL: @loop(
+;CHECK-NOT: <4 x i32>
+define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %idxprom3 = sext i32 %1 to i64
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
+ store i32 %0, i32* %arrayidx4, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+ %2 = load i32, i32* %arrayidx6, align 4
+ store i32 %2, i32* %arrayidx2, align 4
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 512
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; The same loop with parallel loop metadata added to the loop branch
+; and the memory instructions.
+
+;CHECK-LABEL: @parallel_loop(
+;CHECK: <4 x i32>
+define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !13
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !13
+ %idxprom3 = sext i32 %1 to i64
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
+  ; This store might have originated from inlining a function with a parallel
+  ; loop. Its access group (!15) is also listed in the original loop's
+  ; parallel_accesses metadata (!4).
+ store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !15
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+ %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !13
+ store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !13
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 512
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; The same loop with an illegal parallel loop metadata: the memory
+; accesses refer to a different loop's identifier.
+
+;CHECK-LABEL: @mixed_metadata(
+;CHECK-NOT: <4 x i32>
+
+define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !16
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !16
+ %idxprom3 = sext i32 %1 to i64
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3
+  ; This store's access group (!17) belongs to the loop marked with !7, which
+  ; is not the loop we are currently in; that mismatch should prevent this
+  ; loop from being detected as parallel.
+ store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !17
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+ %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !16
+ store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !16
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 512
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13, !15}}
+!4 = !{!4, !{!"llvm.loop.parallel_accesses", !14, !15}}
+!6 = !{!6, !{!"llvm.loop.parallel_accesses", !16}}
+!7 = !{!7, !{!"llvm.loop.parallel_accesses", !17}}
+!13 = distinct !{}
+!14 = distinct !{}
+!15 = distinct !{}
+!16 = distinct !{}
+!17 = distinct !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/powof2div.ll b/llvm/test/Transforms/LoopVectorize/X86/powof2div.ll
new file mode 100644
index 00000000000..3e4bef6d4d0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/powof2div.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+
+@Foo = common global %struct.anon zeroinitializer, align 4
+
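+; A signed division by a power-of-2 constant is cheap enough on x86 for the
+; loop to be vectorized; the checks below expect an sdiv <4 x i32>.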
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x i32>, <4 x i32>*
+; CHECK: sdiv <4 x i32>
+; CHECK: store <4 x i32>
+
+define void @foo() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %div = sdiv i32 %0, 2
+ %arrayidx2 = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv
+ store i32 %div, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
new file mode 100644
index 00000000000..6b61ddbc413
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Ensure that the 'inbounds' is preserved on the GEPs that feed the load and store in the loop.
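+; In rough C-like terms (names are illustrative; the arguments are address
+; space 1 pointers in the IR):
+;   void foo(char **dst, char **src, long n) {
+;     for (long i = 0; i < n; i++)   // both bases are offset by 16 bytes
+;       dst[i + 2] = src[i + 2];
+;   }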
+define void @foo(i8 addrspace(1)* align 8 dereferenceable_or_null(16), i8 addrspace(1)* align 8 dereferenceable_or_null(8), i64) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[PREHEADER:%.*]]
+; CHECK: preheader:
+; CHECK-NEXT: [[DOT10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP0:%.*]], i64 16
+; CHECK-NEXT: [[DOT11:%.*]] = bitcast i8 addrspace(1)* [[DOT10]] to i8 addrspace(1)* addrspace(1)*
+; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16
+; CHECK-NEXT: [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)*
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2:%.*]], 1
+; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP2]], 1
+; CHECK-NEXT: [[UMAX1:%.*]] = select i1 [[TMP4]], i64 [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[UMAX1]], 3
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 [[TMP6]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[DOT10]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[DOT12]], [[SCEVGEP]]
+; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[UMAX]], -16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP7]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP8]], align 8, !alias.scope !0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 4
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP9]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP10]], align 8, !alias.scope !0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 8
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP11]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP12]], align 8, !alias.scope !0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 12
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP13]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP14]], align 8, !alias.scope !0
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP15]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP16]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 4
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP17]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD6]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP18]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 8
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP19]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD7]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP20]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 12
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP21]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD8]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP22]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[DOT18:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDVARS_IV3]]
+; CHECK-NEXT: [[V:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT18]], align 8
+; CHECK-NEXT: [[DOT20:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDVARS_IV3]]
+; CHECK-NEXT: store i8 addrspace(1)* [[V]], i8 addrspace(1)* addrspace(1)* [[DOT20]], align 8
+; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1
+; CHECK-NEXT: [[DOT21:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT4]], [[TMP2]]
+; CHECK-NEXT: br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop !7
+; CHECK: loopexit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %preheader
+
+preheader: ; preds = %4
+ %.10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 16
+ %.11 = bitcast i8 addrspace(1)* %.10 to i8 addrspace(1)* addrspace(1)*
+ %.12 = getelementptr inbounds i8, i8 addrspace(1)* %1, i64 16
+ %.13 = bitcast i8 addrspace(1)* %.12 to i8 addrspace(1)* addrspace(1)*
+ br label %loop
+
+loop:
+ %indvars.iv3 = phi i64 [ 0, %preheader ], [ %indvars.iv.next4, %loop ]
+ %.18 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.13, i64 %indvars.iv3
+ %v = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.18, align 8
+ %.20 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.11, i64 %indvars.iv3
+ store i8 addrspace(1)* %v, i8 addrspace(1)* addrspace(1)* %.20, align 8
+ %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+ %.21 = icmp ult i64 %indvars.iv.next4, %2
+ br i1 %.21, label %loop, label %loopexit
+
+loopexit:
+ ret void
+}
+
+attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
+
+!0 = !{i32 0, i32 2147483646}
+!1 = !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
new file mode 100644
index 00000000000..efbfecc2f54
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -0,0 +1,36 @@
+; PR34438
+; The loop has a short trip count of 8 iterations. It should be vectorized
+; because neither runtime checks nor a tail loop are necessary.
+; Two cases are tested: AVX (MaxVF=8 = TripCount) and AVX512 (MaxVF=16 > TripCount).
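+;
+; A rough C equivalent of the loop (array names follow the IR arguments):
+;   for (int i = 0; i < 8; i++)
+;     A[i] += B[i];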
+
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=skylake-avx512 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @small_tc(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @small_tc
+; CHECK: load <8 x float>, <8 x float>*
+; CHECK: fadd fast <8 x float>
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !llvm.access.group !5
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !5
+ %add = fadd fast float %0, %1
+ store float %add, float* %arrayidx2, align 4, !llvm.access.group !5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+ ret void
+}
+
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !5}}
+!4 = !{!4}
+!5 = distinct !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
new file mode 100644
index 00000000000..1f2a2061586
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s
+
+; The test checks that there is no assert caused by the issue described in PR35432.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common local_unnamed_addr global [192 x [192 x i32]] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define i32 @main() local_unnamed_addr #0 {
+; CHECK-LABEL: @main(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[S:%.*]] = alloca i16, align 2
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[I]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT: store i32 0, i32* [[I]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[S]] to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull [[TMP1]])
+; CHECK-NEXT: [[CALL:%.*]] = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull [[I]])
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4
+; CHECK-NEXT: [[STOREMERGE6:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: store i16 [[STOREMERGE6]], i16* [[S]], align 2
+; CHECK-NEXT: [[CONV17:%.*]] = and i32 [[TMP2]], 65472
+; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[CONV17]], 0
+; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END12:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[TMP3:%.*]] = sub i32 -1, [[TMP2]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[STOREMERGE_IN9:%.*]] = phi i32 [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_INC9:%.*]] ]
+; CHECK-NEXT: [[CONV52:%.*]] = and i32 [[STOREMERGE_IN9]], 255
+; CHECK-NEXT: [[CMP63:%.*]] = icmp ult i32 [[TMP2]], [[CONV52]]
+; CHECK-NEXT: br i1 [[CMP63]], label [[FOR_BODY8_LR_PH:%.*]], label [[FOR_INC9]]
+; CHECK: for.body8.lr.ph:
+; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[STOREMERGE_IN9]] to i8
+; CHECK-NEXT: [[DOTPROMOTED:%.*]] = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[CONV3]], -1
+; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = sub i32 -1, [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP3]], [[TMP6]]
+; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP7]], i32 [[TMP3]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[UMAX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP5]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP9]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[TMP10:%.*]] = add i8 [[CONV3]], -1
+; CHECK-NEXT: [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT: [[TMP12:%.*]] = sub i32 -1, [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP3]], [[TMP12]]
+; CHECK-NEXT: [[UMAX1:%.*]] = select i1 [[TMP13]], i32 [[TMP3]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[UMAX1]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8
+; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP16]])
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP17:%.*]] = add i8 [[TMP10]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP18:%.*]] = sub i8 [[TMP10]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i8 [[TMP18]], [[TMP10]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp ult i8 [[TMP17]], [[TMP10]]
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 true, i1 [[TMP19]], i1 [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i32 [[TMP15]], 255
+; CHECK-NEXT: [[TMP23:%.*]] = or i1 [[TMP21]], [[TMP22]]
+; CHECK-NEXT: [[TMP24:%.*]] = or i1 [[TMP23]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i1 false, [[TMP24]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP9]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP9]], [[N_MOD_VF]]
+; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
+; CHECK-NEXT: [[IND_END:%.*]] = sub i8 [[CONV3]], [[CAST_CRD]]
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[DOTPROMOTED]], i32 0
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP26]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = trunc i32 [[INDEX]] to i8
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i8 [[CONV3]], [[TMP27]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], <i8 0, i8 -1, i8 -2, i8 -3>
+; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], <i8 -4, i8 -5, i8 -6, i8 -7>
+; CHECK-NEXT: [[TMP28:%.*]] = add i8 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP29:%.*]] = add i8 [[OFFSET_IDX]], -4
+; CHECK-NEXT: [[TMP30]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP31]] = add <4 x i32> [[VEC_PHI2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP32:%.*]] = add i8 [[TMP28]], -1
+; CHECK-NEXT: [[TMP33:%.*]] = add i8 [[TMP29]], -1
+; CHECK-NEXT: [[TMP34:%.*]] = zext i8 [[TMP32]] to i32
+; CHECK-NEXT: [[TMP35:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK: middle.block:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP31]], [[TMP30]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX4]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP9]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[DOTPROMOTED]], [[FOR_BODY8_LR_PH]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY8:%.*]]
+; CHECK: for.body8:
+; CHECK-NEXT: [[INC5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY8]] ]
+; CHECK-NEXT: [[C_04:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[FOR_BODY8]] ]
+; CHECK-NEXT: [[INC]] = add i32 [[INC5]], 1
+; CHECK-NEXT: [[DEC]] = add i8 [[C_04]], -1
+; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[DEC]] to i32
+; CHECK-NEXT: [[CMP6:%.*]] = icmp ult i32 [[TMP2]], [[CONV5]]
+; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop !2
+; CHECK: for.cond4.for.inc9_crit_edge:
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_BODY8]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: store i32 [[INC_LCSSA]], i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+; CHECK-NEXT: br label [[FOR_INC9]]
+; CHECK: for.inc9:
+; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[STOREMERGE_IN9]], 65535
+; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[CONV10]], 1
+; CHECK-NEXT: [[CONV1:%.*]] = and i32 [[ADD]], 65472
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CONV1]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END12_CRIT_EDGE:%.*]]
+; CHECK: for.cond.for.end12_crit_edge:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INC9]] ]
+; CHECK-NEXT: [[STOREMERGE:%.*]] = trunc i32 [[ADD_LCSSA]] to i16
+; CHECK-NEXT: store i16 [[STOREMERGE]], i16* [[S]], align 2
+; CHECK-NEXT: br label [[FOR_END12]]
+; CHECK: for.end12:
+; CHECK-NEXT: [[CALL13:%.*]] = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull [[S]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull [[TMP1]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]])
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %i = alloca i32, align 4
+ %s = alloca i16, align 2
+ %0 = bitcast i32* %i to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3
+ store i32 0, i32* %i, align 4
+ %1 = bitcast i16* %s to i8*
+ call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull %1) #3
+ %call = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i) #3
+ %2 = load i32, i32* %i, align 4
+ %storemerge6 = trunc i32 %2 to i16
+ store i16 %storemerge6, i16* %s, align 2
+ %conv17 = and i32 %2, 65472
+ %cmp8 = icmp eq i32 %conv17, 0
+ br i1 %cmp8, label %for.body.lr.ph, label %for.end12
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.inc9
+ %storemerge.in9 = phi i32 [ %2, %for.body.lr.ph ], [ %add, %for.inc9 ]
+ %conv52 = and i32 %storemerge.in9, 255
+ %cmp63 = icmp ult i32 %2, %conv52
+ br i1 %cmp63, label %for.body8.lr.ph, label %for.inc9
+
+for.body8.lr.ph: ; preds = %for.body
+ %conv3 = trunc i32 %storemerge.in9 to i8
+ %.promoted = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+ br label %for.body8
+
+for.body8: ; preds = %for.body8.lr.ph, %for.body8
+ %inc5 = phi i32 [ %.promoted, %for.body8.lr.ph ], [ %inc, %for.body8 ]
+ %c.04 = phi i8 [ %conv3, %for.body8.lr.ph ], [ %dec, %for.body8 ]
+ %inc = add i32 %inc5, 1
+ %dec = add i8 %c.04, -1
+ %conv5 = zext i8 %dec to i32
+ %cmp6 = icmp ult i32 %2, %conv5
+ br i1 %cmp6, label %for.body8, label %for.cond4.for.inc9_crit_edge
+
+for.cond4.for.inc9_crit_edge: ; preds = %for.body8
+ %inc.lcssa = phi i32 [ %inc, %for.body8 ]
+ store i32 %inc.lcssa, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16
+ br label %for.inc9
+
+for.inc9: ; preds = %for.cond4.for.inc9_crit_edge, %for.body
+ %conv10 = and i32 %storemerge.in9, 65535
+ %add = add nuw nsw i32 %conv10, 1
+ %conv1 = and i32 %add, 65472
+ %cmp = icmp eq i32 %conv1, 0
+ br i1 %cmp, label %for.body, label %for.cond.for.end12_crit_edge
+
+for.cond.for.end12_crit_edge: ; preds = %for.inc9
+ %add.lcssa = phi i32 [ %add, %for.inc9 ]
+ %storemerge = trunc i32 %add.lcssa to i16
+ store i16 %storemerge, i16* %s, align 2
+ br label %for.end12
+
+for.end12: ; preds = %for.cond.for.end12_crit_edge, %entry
+ %call13 = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull %s) #3
+ call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull %1) #3
+ call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #3
+ ret i32 0
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare i32 @goo(...) local_unnamed_addr #2
+
+declare i32 @foo(...) local_unnamed_addr #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
new file mode 100644
index 00000000000..51d70d773a6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 2, i64 3, i64 4, i64 5>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 2, [[INDEX]]
+; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[OFFSET_IDX1]] to i32
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+;
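+; The loop below counts an i64 IV up from 2, masking it to 32 bits and
+; truncating it to i32 each iteration; the checks above verify that the
+; vectorizer builds correct inductions for both forms.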
+entry:
+ br label %loop
+
+loop:
+ %0 = phi i64 [ 2, %entry ], [ %3, %loop ]
+ %1 = and i64 %0, 4294967295
+ %2 = trunc i64 %0 to i32
+ %3 = add nuw nsw i64 %1, 1
+ %4 = icmp sgt i32 %2, 80
+ br i1 %4, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
new file mode 100644
index 00000000000..38ca5f964a0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll
@@ -0,0 +1,98 @@
+; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Make sure that we can compile the test without crashing.
+define void @barney() {
+
+; CHECK-LABEL: @barney(
+; CHECK: middle.block:
+
+bb:
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb
+ %tmp4 = icmp slt i32 undef, 0
+ br i1 %tmp4, label %bb2, label %bb5
+
+bb5: ; preds = %bb2
+ br label %bb19
+
+bb18: ; preds = %bb33
+ ret void
+
+bb19: ; preds = %bb36, %bb5
+ %tmp21 = phi i64 [ undef, %bb36 ], [ 2, %bb5 ]
+ %tmp22 = phi i32 [ %tmp65, %bb36 ], [ undef, %bb5 ]
+ br label %bb50
+
+bb33: ; preds = %bb62
+ br i1 undef, label %bb18, label %bb36
+
+bb36: ; preds = %bb33
+ br label %bb19
+
+bb46: ; preds = %bb50
+ br i1 undef, label %bb48, label %bb59
+
+bb48: ; preds = %bb46
+ %tmp49 = add i32 %tmp52, 14
+ ret void
+
+bb50: ; preds = %bb50, %bb19
+ %tmp52 = phi i32 [ %tmp55, %bb50 ], [ %tmp22, %bb19 ]
+ %tmp53 = phi i64 [ %tmp56, %bb50 ], [ 1, %bb19 ]
+ %tmp54 = add i32 %tmp52, 12
+ %tmp55 = add i32 %tmp52, 13
+ %tmp56 = add nuw nsw i64 %tmp53, 1
+ %tmp58 = icmp ult i64 %tmp53, undef
+ br i1 %tmp58, label %bb50, label %bb46
+
+bb59: ; preds = %bb46
+ br label %bb62
+
+bb62: ; preds = %bb68, %bb59
+ %tmp63 = phi i32 [ %tmp65, %bb68 ], [ %tmp55, %bb59 ]
+ %tmp64 = phi i64 [ %tmp66, %bb68 ], [ %tmp56, %bb59 ]
+ %tmp65 = add i32 %tmp63, 13
+ %tmp66 = add nuw nsw i64 %tmp64, 1
+ %tmp67 = icmp ult i64 %tmp66, %tmp21
+ br i1 %tmp67, label %bb68, label %bb33
+
+bb68: ; preds = %bb62
+ br label %bb62
+}
+
+define i32 @foo(i32 addrspace(1)* %p) {
+
+; CHECK-LABEL: foo
+; CHECK: middle.block:
+
+entry:
+ br label %outer
+
+outer: ; preds = %outer_latch, %entry
+ %iv = phi i64 [ 2, %entry ], [ %iv.next, %outer_latch ]
+ br label %inner
+
+inner: ; preds = %inner, %outer
+ %0 = phi i32 [ %2, %inner ], [ 0, %outer ]
+ %a = phi i32 [ %3, %inner ], [ 1, %outer ]
+ %b = phi i32 [ %1, %inner ], [ 6, %outer ]
+ %1 = add i32 %b, 2
+ %2 = or i32 %0, %b
+ %3 = add nuw nsw i32 %a, 1
+ %4 = zext i32 %3 to i64
+ %5 = icmp ugt i64 %iv, %4
+ br i1 %5, label %inner, label %outer_latch
+
+outer_latch: ; preds = %inner
+ store atomic i32 %2, i32 addrspace(1)* %p unordered, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %6 = icmp ugt i64 %iv, 63
+ br i1 %6, label %exit, label %outer
+
+exit: ; preds = %outer_latch
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/propagate-metadata.ll b/llvm/test/Transforms/LoopVectorize/X86/propagate-metadata.ll
new file mode 100644
index 00000000000..2825ddbac9d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/propagate-metadata.ll
@@ -0,0 +1,25 @@
+; RUN: opt -S -mtriple="x86_64-unknown-linux-gnu" -loop-vectorize < %s | FileCheck %s
+
+; Don't crash on unknown metadata
+; CHECK-LABEL: @no_propagate_range_metadata(
+; CHECK: load <16 x i8>
+; CHECK: store <16 x i8>
+define void @no_propagate_range_metadata(i8* readonly %first.coerce, i8* readnone %last.coerce, i8* nocapture %result) {
+for.body.preheader:
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %result.addr.05 = phi i8* [ %incdec.ptr, %for.body ], [ %result, %for.body.preheader ]
+ %first.sroa.0.04 = phi i8* [ %incdec.ptr.i.i.i, %for.body ], [ %first.coerce, %for.body.preheader ]
+ %0 = load i8, i8* %first.sroa.0.04, align 1, !range !0
+ store i8 %0, i8* %result.addr.05, align 1
+ %incdec.ptr.i.i.i = getelementptr inbounds i8, i8* %first.sroa.0.04, i64 1
+ %incdec.ptr = getelementptr inbounds i8, i8* %result.addr.05, i64 1
+ %lnot.i = icmp eq i8* %incdec.ptr.i.i.i, %last.coerce
+ br i1 %lnot.i, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ ret void
+}
+
+!0 = !{i8 0, i8 2}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll
new file mode 100644
index 00000000000..13ceaef4dbb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll
@@ -0,0 +1,20 @@
+; RUN: opt -loop-vectorize -S %s
+
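+; No FileCheck is run here: the test only verifies that the vectorizer does
+; not crash on the pointer induction variable paired with the i128 induction.
+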
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f(i128 %p1) {
+entry:
+ br label %while.body
+
+while.body:
+ %p.05 = phi i8* [ %add.ptr, %while.body ], [ null, %entry ]
+ %p1.addr.04 = phi i128 [ %sub, %while.body ], [ %p1, %entry ]
+ %add.ptr = getelementptr inbounds i8, i8* %p.05, i32 2
+ %sub = add nsw i128 %p1.addr.04, -2
+ %tobool = icmp eq i128 %sub, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll
new file mode 100644
index 00000000000..4284fbacfa7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll
@@ -0,0 +1,33 @@
+; RUN: opt -slp-vectorizer -S %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; This test used to fail under libgmalloc because we would try to access a
+; pointer that had already been deleted.
+;
+; llvm-lit -v --param use_gmalloc=1 --param
+; gmalloc_path=/usr/lib/libgmalloc.dylib
+; test/Transforms/LoopVectorize/X86/rauw-bug.ll
+;
+; radar://15498655
+
+; CHECK: reduced
+define void @reduced() {
+entry:
+ br i1 undef, label %while.body, label %while.cond63.preheader.while.end76_crit_edge
+
+while.cond63.preheader.while.end76_crit_edge:
+ ret void
+
+while.body:
+ %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ]
+ %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ]
+ %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ]
+ %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ]
+ %div50 = fmul double %d3_fx.012, 1.250000e-01
+ %sub52 = fsub double 0.000000e+00, %div50
+ %div56 = fmul double %d3_fy.013, 1.250000e-01
+ %sub58 = fsub double 0.000000e+00, %div56
+ br label %while.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
new file mode 100644
index 00000000000..6393002d507
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -loop-vectorize -mcpu=prescott -disable-basicaa < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-apple-darwin"
+
+; PR15344
+define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
+; CHECK-LABEL: @test1(
+; CHECK: preheader
+; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+; CHECK: vector.memcheck
+
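+; The fadd fast reduction below is seeded with a value loaded before the loop;
+; the insertelement check above verifies that this start value is placed in
+; lane 0 of the vector phi.
+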
+bb:
+ br label %bb2
+
+bb2: ; preds = %bb
+ %tmp = load double, double* null, align 8
+ br i1 undef, label %bb3, label %bb12
+
+bb3: ; preds = %bb3, %bb2
+ %tmp4 = phi double [ %tmp9, %bb3 ], [ %tmp, %bb2 ]
+ %tmp5 = phi i32 [ %tmp8, %bb3 ], [ 0, %bb2 ]
+ %tmp6 = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 %tmp5
+ %tmp7 = load double, double* %tmp6, align 4
+ %tmp8 = add nsw i32 %tmp5, 1
+ %tmp9 = fadd fast double %tmp4, undef
+ %tmp10 = getelementptr inbounds float, float* %arg, i32 %tmp5
+ store float undef, float* %tmp10, align 4
+ %tmp11 = icmp eq i32 %tmp8, %arg1
+ br i1 %tmp11, label %bb12, label %bb3
+
+bb12: ; preds = %bb3, %bb2
+ %tmp13 = phi double [ %tmp, %bb2 ], [ %tmp9, %bb3 ]
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
new file mode 100644
index 00000000000..1146e31ec25
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll
@@ -0,0 +1,112 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
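+; A floating-point sum reduction is only legal to vectorize when the fadd
+; allows reassociation (the fast or reassoc flags below); the plain IEEE
+; version must therefore stay scalar.
+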
+define float @reduction_sum_float_ieee(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_ieee(
+entry:
+ %entry.cond = icmp ne i32 0, 4096
+ br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+ %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+ %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+ %address = getelementptr float, float* %array, i32 %idx
+ %value = load float, float* %address
+ %sum.inc = fadd float %sum, %value
+ %idx.inc = add i32 %idx, 1
+ %be.cond = icmp ne i32 %idx.inc, 4096
+ br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+ %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK-NOT: %wide.load = load <4 x float>, <4 x float>*
+; CHECK: ret float %sum.lcssa
+ ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_fastmath(
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
+entry:
+ %entry.cond = icmp ne i32 0, 4096
+ br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+ %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+ %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+ %address = getelementptr float, float* %array, i32 %idx
+ %value = load float, float* %address
+ %sum.inc = fadd fast float %sum, %value
+ %idx.inc = add i32 %idx, 1
+ %be.cond = icmp ne i32 %idx.inc, 4096
+ br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+ %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+ ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_only_reassoc(
+; CHECK-NOT: fadd fast
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+
+entry:
+ %entry.cond = icmp ne i32 0, 4096
+ br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+ %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+ %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+ %address = getelementptr float, float* %array, i32 %idx
+ %value = load float, float* %address
+ %sum.inc = fadd reassoc float %sum, %value
+ %idx.inc = add i32 %idx, 1
+ %be.cond = icmp ne i32 %idx.inc, 4096
+ br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+ %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+ ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_only_reassoc_and_contract(
+; CHECK-NOT: fadd fast
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+
+entry:
+ %entry.cond = icmp ne i32 0, 4096
+ br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+ %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+ %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+ %address = getelementptr float, float* %array, i32 %idx
+ %value = load float, float* %address
+ %sum.inc = fadd reassoc contract float %sum, %value
+ %idx.inc = add i32 %idx, 1
+ %be.cond = icmp ne i32 %idx.inc, 4096
+ br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+ %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+ ret float %sum.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
new file mode 100644
index 00000000000..7c29faa51e6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -0,0 +1,80 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -mcpu=core-avx2 -force-vector-interleave=1 -dce -instcombine -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Make sure we ignore the costs of the redundant reduction casts
+; char reduction_i8(char *a, char *b, int n) {
+; char sum = 0;
+; for (int i = 0; i < n; ++i)
+; sum += (a[i] + b[i]);
+; return sum;
+; }
+;
+
+; CHECK-LABEL: reduction_i8
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = load
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = load
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = and i32 %{{.*}}, 255
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = trunc
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = icmp
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: br
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = load
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = load
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = and i32 %{{.*}}, 255
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = trunc
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = icmp
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: br
+;
+define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
+entry:
+ %cmp.12 = icmp sgt i32 %n, 0
+ br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.for.cond.cleanup_crit_edge:
+ %add5.lcssa = phi i32 [ %add5, %for.body ]
+ %conv6 = trunc i32 %add5.lcssa to i8
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %sum.0.lcssa = phi i8 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ]
+ ret i8 %sum.0.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %sum.013 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = zext i8 %1 to i32
+ %conv4 = and i32 %sum.013, 255
+ %add = add nuw nsw i32 %conv, %conv4
+ %add5 = add nuw nsw i32 %add, %conv3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
new file mode 100644
index 00000000000..bd69d27d2d5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -mtriple x86_64 -debug -disable-output 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Check that the cost model is not executed twice for VF=2 when vectorization is
+; forced for a particular loop.
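+; Vectorization is forced via the llvm.loop.vectorize.enable metadata (!0 and
+; !1 at the end of the file).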
+
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{[0-9]+}} = load i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: store i32
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{[0-9]+}} = load i32
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: store i32
+; CHECK: LV: Vector loop of width 2 costs: {{[0-9]+}}.
+
+define i32 @foo(i32* %A, i32 %n) {
+entry:
+ %cmp3.i = icmp eq i32 %n, 0
+ br i1 %cmp3.i, label %exit, label %for.body.i
+
+for.body.i:
+ %iv = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+ %ld_addr = getelementptr inbounds i32, i32* %A, i32 %iv
+ %0 = load i32, i32* %ld_addr, align 4
+ %val = add i32 %0, 1
+ store i32 %val, i32* %ld_addr, align 4
+ %add.i = add nsw i32 %iv, 1
+ %cmp.i = icmp eq i32 %add.i, %n
+ br i1 %cmp.i, label %exit, label %for.body.i, !llvm.loop !0
+
+exit:
+ %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+ ret i32 %__init.addr.0.lcssa.i
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
new file mode 100644
index 00000000000..8205092deff
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
@@ -0,0 +1,134 @@
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Test that the register usage estimation is not affected by the presence of
+; debug intrinsics.
+;
+; In the test below, the live ranges of the values %0 and %r.08 end at the add
+; instruction preceding the call to the intrinsic, and so they are recorded
+; against the index of the call instruction. This means the debug intrinsic
+; must be considered when erasing instructions from the list of open intervals.
+;
+; Tests generated from the following source (with and without -g):
+
+; unsigned test(unsigned *a, unsigned n) {
+; unsigned i, r = 0;
+; for(i = 0; i < n; i++)
+; r += a[i];
+; return r;
+; }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: LV: Checking a loop in "test_g"
+; CHECK: LV(REG): Found max usage: 2
+
+define i32 @test_g(i32* nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !12, metadata !16), !dbg !17
+ tail call void @llvm.dbg.value(metadata i32 %n, i64 0, metadata !13, metadata !16), !dbg !18
+ tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
+ tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14, metadata !16), !dbg !20
+ tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
+ tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14, metadata !16), !dbg !20
+ %cmp6 = icmp eq i32 %n, 0, !dbg !21
+ br i1 %cmp6, label %for.end, label %for.body.preheader, !dbg !25
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64, !dbg !21
+ br label %for.body, !dbg !27
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %r.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv, !dbg !27
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !27, !tbaa !28
+ %add = add i32 %0, %r.08, !dbg !32
+ tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !15, metadata !16), !dbg !19
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !33
+ tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !15, metadata !16), !dbg !19
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !21
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !25, !llvm.loop !35
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end, !dbg !38
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ]
+ ret i32 %r.0.lcssa, !dbg !38
+}
+
+; CHECK: LV: Checking a loop in "test"
+; CHECK: LV(REG): Found max usage: 2
+
+define i32 @test(i32* nocapture readonly %a, i32 %n) local_unnamed_addr {
+entry:
+ %cmp6 = icmp eq i32 %n, 0
+ br i1 %cmp6, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %r.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4, !tbaa !28
+ %add = add i32 %0, %r.08
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ]
+ ret i32 %r.0.lcssa
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "test_g", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !10, !9}
+!9 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64)
+!11 = !{!12, !13, !14, !15}
+!12 = !DILocalVariable(name: "a", arg: 1, scope: !6, file: !1, line: 1, type: !10)
+!13 = !DILocalVariable(name: "n", arg: 2, scope: !6, file: !1, line: 1, type: !9)
+!14 = !DILocalVariable(name: "i", scope: !6, file: !1, line: 2, type: !9)
+!15 = !DILocalVariable(name: "r", scope: !6, file: !1, line: 2, type: !9)
+!16 = !DIExpression()
+!17 = !DILocation(line: 1, column: 27, scope: !6)
+!18 = !DILocation(line: 1, column: 39, scope: !6)
+!19 = !DILocation(line: 2, column: 15, scope: !6)
+!20 = !DILocation(line: 2, column: 12, scope: !6)
+!21 = !DILocation(line: 3, column: 16, scope: !22)
+!22 = !DILexicalBlockFile(scope: !23, file: !1, discriminator: 1)
+!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 3, column: 3)
+!24 = distinct !DILexicalBlock(scope: !6, file: !1, line: 3, column: 3)
+!25 = !DILocation(line: 3, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !1, discriminator: 1)
+!27 = !DILocation(line: 4, column: 10, scope: !23)
+!28 = !{!29, !29, i64 0}
+!29 = !{!"int", !30, i64 0}
+!30 = !{!"omnipotent char", !31, i64 0}
+!31 = !{!"Simple C/C++ TBAA"}
+!32 = !DILocation(line: 4, column: 7, scope: !23)
+!33 = !DILocation(line: 3, column: 22, scope: !34)
+!34 = !DILexicalBlockFile(scope: !23, file: !1, discriminator: 2)
+!35 = distinct !{!35, !36, !37}
+!36 = !DILocation(line: 3, column: 3, scope: !24)
+!37 = !DILocation(line: 4, column: 13, scope: !24)
+!38 = !DILocation(line: 5, column: 3, scope: !6)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
new file mode 100644
index 00000000000..9b276aa2bd7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -0,0 +1,135 @@
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F
+; REQUIRES: asserts
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @foo() {
+; This function has a loop with a SAD (sum of absolute differences) pattern.
+; Here we check that when VF = 16 the register usage doesn't exceed 16.
+;
+; CHECK-LABEL: foo
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 13
+
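+; A rough C equivalent of the SAD loop (variable names are illustrative):
+;   int s = 0;
+;   for (int i = 0; i < 1024; i++)
+;     s += abs(a[i] - b[i]);
+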
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %add.lcssa
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = zext i8 %1 to i32
+ %sub = sub nsw i32 %conv, %conv3
+ %ispos = icmp sgt i32 %sub, -1
+ %neg = sub nsw i32 0, %sub
+ %2 = select i1 %ispos, i32 %sub, i32 %neg
+ %add = add nsw i32 %2, %s.015
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @goo() {
+; When indvars.iv is used in a computation chain that only feeds getelementptr or
+; cmp instructions, it will not have a vector version, so the vector register usage
+; will not exceed the number of available vector registers.
+; CHECK-LABEL: goo
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 13
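+;
+; A rough C equivalent (the index offsets are taken from the IR):
+;   int s = 0;
+;   for (int i = 0; i < 1024; i++)
+;     s += abs(a[i + 3] - b[i + 2]);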
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %add.lcssa
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %tmp1 = add nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1
+ %tmp = load i8, i8* %arrayidx, align 1
+ %conv = zext i8 %tmp to i32
+ %tmp2 = add nsw i64 %indvars.iv, 2
+ %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2
+ %tmp3 = load i8, i8* %arrayidx2, align 1
+ %conv3 = zext i8 %tmp3 to i32
+ %sub = sub nsw i32 %conv, %conv3
+ %ispos = icmp sgt i32 %sub, -1
+ %neg = sub nsw i32 0, %sub
+ %tmp4 = select i1 %ispos, i32 %sub, i32 %neg
+ %add = add nsw i32 %tmp4, %s.015
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i64 @bar(i64* nocapture %a) {
+; CHECK-LABEL: bar
+; CHECK: LV(REG): VF = 2
+; CHECK: LV(REG): Found max usage: 3
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ %add2.lcssa = phi i64 [ %add2, %for.body ]
+ ret i64 %add2.lcssa
+
+for.body:
+ %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
+ %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
+ %0 = load i64, i64* %arrayidx, align 8
+ %add = add nsw i64 %0, %i.012
+ store i64 %add, i64* %arrayidx, align 8
+ %add2 = add nsw i64 %add, %s.011
+ %inc = add nuw nsw i64 %i.012, 1
+ %exitcond = icmp eq i64 %inc, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+@d = external global [0 x i64], align 8
+@e = external global [0 x i32], align 4
+@c = external global [0 x i32], align 4
+
+define void @hoo(i32 %n) {
+; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive, but its index %tmp
+; can be gathered into a vector. For VF == 16, the vector version of %tmp will be
+; <16 x i64>, so the max usage of AVX512 vector registers will be 2.
+; AVX512F-LABEL: hoo
+; AVX512F: LV(REG): VF = 16
+; AVX512F: LV(REG): Found max usage: 2
+;
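+; A rough C equivalent (the loop bound is taken from the IR):
+;   for (long i = 0; i < 10000; i++)
+;     c[i] = e[d[i]];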
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv
+ %tmp = load i64, i64* %arrayidx, align 8
+ %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp
+ %tmp1 = load i32, i32* %arrayidx1, align 4
+ %arrayidx3 = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv
+ store i32 %tmp1, i32* %arrayidx3, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 10000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll b/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll
new file mode 100644
index 00000000000..1add87db611
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -loop-vectorize -instcombine -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
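+; Check that after vectorization the llvm.assume call derived from the
+; alignment mask is kept but emitted only once, not once per vector lane.
+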
+define void @test1() {
+entry:
+ %alloca = alloca float, align 4
+ br label %loop_exit.dim.11.critedge
+
+loop_exit.dim.11.critedge: ; preds = %loop_body.dim.0
+ %ptrint = ptrtoint float* %alloca to i64
+ %maskedptr = and i64 %ptrint, 4
+ %maskcond = icmp eq i64 %maskedptr, 0
+ br label %loop_header.dim.017.preheader
+
+loop_header.dim.017.preheader: ; preds = %loop_exit.dim.016, %loop_exit.dim.11.critedge
+ br label %loop_body.dim.018
+
+loop_body.dim.018: ; preds = %loop_body.dim.018, %loop_header.dim.017.preheader
+ %invar_address.dim.019.0135 = phi i64 [ 0, %loop_header.dim.017.preheader ], [ %0, %loop_body.dim.018 ]
+ call void @llvm.assume(i1 %maskcond)
+; CHECK: call void @llvm.assume(
+; CHECK-NOT: call void @llvm.assume(
+ %0 = add nuw nsw i64 %invar_address.dim.019.0135, 1
+ %1 = icmp eq i64 %0, 256
+ br i1 %1, label %loop_header.dim.017.preheader, label %loop_body.dim.018
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
new file mode 100755
index 00000000000..aff372b562f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+; This test checks the vector GEP that is generated before a scatter.
+; The code below used to crash because the SSA form was destroyed by incorrect
+; vectorization of the GEP.
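+; The expected vectorized form is a GEP with a vector of indices feeding a
+; masked scatter, schematically (see the CHECK lines below):
+;   %vecptrs = getelementptr ..., <16 x i64> %vecidx
+;   call void @llvm.masked.scatter.v16i32.v16p0i32(..., <16 x i32*> %vecptrs, ...)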
+
+@d = global [10 x [10 x i32]] zeroinitializer, align 16
+@c = external global i32, align 4
+@a = external global i32, align 4
+@b = external global i64, align 8
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @_Z3fn1v() #0 {
+; CHECK-LABEL: @_Z3fn1v(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+entry:
+ %0 = load i32, i32* @c, align 4
+ %cmp34 = icmp sgt i32 %0, 8
+ br i1 %cmp34, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %1 = load i32, i32* @a, align 4
+ %tobool = icmp eq i32 %1, 0
+ %2 = load i64, i64* @b, align 8
+ %mul = mul i64 %2, 4063299859190
+ %tobool6 = icmp eq i64 %mul, 0
+ %3 = sext i32 %0 to i64
+ br i1 %tobool, label %for.body.us.preheader, label %for.body.preheader
+
+for.body.preheader: ; preds = %for.body.lr.ph
+ br label %for.body
+
+for.body.us.preheader: ; preds = %for.body.lr.ph
+ br label %for.body.us
+
+for.body.us: ; preds = %for.body.us.preheader, %for.cond.cleanup4.us-lcssa.us.us
+ %indvars.iv78 = phi i64 [ %indvars.iv.next79, %for.cond.cleanup4.us-lcssa.us.us ], [ 8, %for.body.us.preheader ]
+ %indvars.iv70 = phi i64 [ %indvars.iv.next71, %for.cond.cleanup4.us-lcssa.us.us ], [ 0, %for.body.us.preheader ]
+ %4 = sub nsw i64 8, %indvars.iv78
+ %add.ptr.us = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 %indvars.iv78
+ %5 = add nsw i64 %4, %indvars.iv70
+ %arraydecay.us.us.us = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %5, i64 0
+ br i1 %tobool6, label %for.body5.us.us.us.preheader, label %for.body5.us.us48.preheader
+
+for.body5.us.us48.preheader: ; preds = %for.body.us
+ store i32 8, i32* %arraydecay.us.us.us, align 16
+ %indvars.iv.next66 = or i64 %indvars.iv70, 1
+ %6 = add nsw i64 %4, %indvars.iv.next66
+ %arraydecay.us.us55.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %6, i64 0
+ store i32 8, i32* %arraydecay.us.us55.1, align 8
+ br label %for.cond.cleanup4.us-lcssa.us.us
+
+for.body5.us.us.us.preheader: ; preds = %for.body.us
+ store i32 7, i32* %arraydecay.us.us.us, align 16
+ %indvars.iv.next73 = or i64 %indvars.iv70, 1
+ %7 = add nsw i64 %4, %indvars.iv.next73
+ %arraydecay.us.us.us.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %7, i64 0
+ store i32 7, i32* %arraydecay.us.us.us.1, align 8
+ br label %for.cond.cleanup4.us-lcssa.us.us
+
+for.cond.cleanup4.us-lcssa.us.us: ; preds = %for.body5.us.us48.preheader, %for.body5.us.us.us.preheader
+ %indvars.iv.next79 = add nuw nsw i64 %indvars.iv78, 2
+ %cmp.us = icmp slt i64 %indvars.iv.next79, %3
+ %indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 2
+ br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit
+
+for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup4.us-lcssa.us.us
+ br label %for.cond.cleanup
+
+for.cond.cleanup.loopexit99: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit99, %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv95 = phi i64 [ %indvars.iv.next96, %for.body ], [ 8, %for.body.preheader ]
+ %indvars.iv87 = phi i64 [ %indvars.iv.next88, %for.body ], [ 0, %for.body.preheader ]
+ %8 = sub nsw i64 8, %indvars.iv95
+ %add.ptr = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 %indvars.iv95
+ %9 = add nsw i64 %8, %indvars.iv87
+ %arraydecay.us31 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr, i64 %9, i64 0
+ store i32 8, i32* %arraydecay.us31, align 16
+ %indvars.iv.next90 = or i64 %indvars.iv87, 1
+ %10 = add nsw i64 %8, %indvars.iv.next90
+ %arraydecay.us31.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr, i64 %10, i64 0
+ store i32 8, i32* %arraydecay.us31.1, align 8
+ %indvars.iv.next96 = add nuw nsw i64 %indvars.iv95, 2
+ %cmp = icmp slt i64 %indvars.iv.next96, %3
+ %indvars.iv.next88 = add nuw nsw i64 %indvars.iv87, 2
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll b/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
new file mode 100644
index 00000000000..cd3e89ae735
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
+; REQUIRES: asserts
+; This test should not be vectorized for the X86/SLM arch.
+; Vectorizing the 64-bit multiply in this case is wrong, since it can be done
+; in a lower bit width (note that the sources are 16-bit).
+; Also, addq/subq (quad-word add/sub) have a high cost on the SLM arch.
+; This test shows bad performance (a -70% regression) if vectorized on SLM.
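+; Roughly, in C (a hypothetical reconstruction of the loop below):
+;   long long Accumulator = 0;
+;   for (int i = 0; i < LastIndex; i++)
+;     Accumulator += ((long long)InputData[i] * InputData[i + lag]) >> Scale;
+;   return (int)Accumulator;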
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
+entry:
+; MSG: LV: Selecting VF: 1.
+ %cmp17 = icmp sgt i32 %LastIndex, 0
+ br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ %conv5 = sext i16 %Scale to i64
+ %sh_prom = and i64 %conv5, 4294967295
+ %0 = sext i16 %lag to i64
+ %wide.trip.count = zext i32 %LastIndex to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %conv8 = trunc i64 %add7 to i32
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
+ ret i32 %Accumulator.0.lcssa
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
+ %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv
+ %1 = load i16, i16* %arrayidx, align 2
+ %conv = sext i16 %1 to i64
+ %2 = add nsw i64 %indvars.iv, %0
+ %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
+ %3 = load i16, i16* %arrayidx3, align 2
+ %conv4 = sext i16 %3 to i64
+ %mul = mul nsw i64 %conv4, %conv
+ %shr = ashr i64 %mul, %sh_prom
+ %add7 = add i64 %shr, %Accumulator.018
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
new file mode 100644
index 00000000000..e162a3a6b2b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll
@@ -0,0 +1,408 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+@G = common global [32 x [1024 x i32]] zeroinitializer, align 16
+@ub = common global [1024 x i32] zeroinitializer, align 16
+@uc = common global [1024 x i32] zeroinitializer, align 16
+@d = common global [2048 x i32] zeroinitializer, align 16
+@fa = common global [1024 x float] zeroinitializer, align 16
+@fb = common global [1024 x float] zeroinitializer, align 16
+@ic = common global [1024 x i32] zeroinitializer, align 16
+@da = common global [1024 x float] zeroinitializer, align 16
+@db = common global [1024 x float] zeroinitializer, align 16
+@dc = common global [1024 x float] zeroinitializer, align 16
+@dd = common global [1024 x float] zeroinitializer, align 16
+@dj = common global [1024 x i32] zeroinitializer, align 16
+
+; We can vectorize this loop without needing a tail: the trip count of 256 is
+; a multiple of the vector width.
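+; Roughly, in C:
+;   for (i = 0; i < 256; i++)
+;     a[i] = b[i] + c[i];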
+define void @example1() optsize {
+; CHECK-LABEL: @example1(
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[TMP9:%.*]]
+; CHECK: br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2
+; CHECK: ret void
+;
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+; We can vectorize this in 'optsize' mode by masking the needed tail.
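+; Roughly, in C (a hypothetical reconstruction; note that i is not reset
+; between the two loops):
+;   for (i = 0; i < n; i++) b[i] = x;
+;   while (n--) { a[i] = b[i] & c[i]; i++; }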
+define void @example2(i32 %n, i32 %x) optsize {
+; CHECK-LABEL: @example2(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]]
+; CHECK: .lr.ph5.preheader:
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP10]], align 16
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK: pred.store.if3:
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP5]]
+; CHECK-NEXT: store i32 [[X]], i32* [[TMP12]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.continue4:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP6]]
+; CHECK-NEXT: store i32 [[X]], i32* [[TMP14]], align 8
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.if7:
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP7]]
+; CHECK-NEXT: store i32 [[X]], i32* [[TMP16]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.continue8:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: ret void
+;
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph5, label %.preheader
+
+..preheader_crit_edge: ; preds = %.lr.ph5
+ %phitmp = sext i32 %n to i64
+ br label %.preheader
+
+.preheader: ; preds = %..preheader_crit_edge, %0
+ %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ]
+ %2 = icmp eq i32 %n, 0
+ br i1 %2, label %._crit_edge, label %.lr.ph
+
+.lr.ph5: ; preds = %0, %.lr.ph5
+ %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
+ %3 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv6
+ store i32 %x, i32* %3, align 4
+ %indvars.iv.next7 = add i64 %indvars.iv6, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5
+
+.lr.ph: ; preds = %.preheader, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]
+ %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]
+ %4 = add nsw i32 %.02, -1
+ %5 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %6 = load i32, i32* %5, align 4
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %8 = load i32, i32* %7, align 4
+ %9 = and i32 %8, %6
+ %10 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %9, i32* %10, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %11 = icmp eq i32 %4, 0
+ br i1 %11, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %.preheader
+ ret void
+}
+
+; N is unknown, so we need a tail. We can't vectorize, because the loop has no
+; primary induction variable.
+;CHECK-LABEL: @example3(
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
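+; Roughly, in C:
+;   while (n--)
+;     *p++ = *q++;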
+define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize {
+ %1 = icmp eq i32 %n, 0
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
+ %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
+ %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
+ %2 = add nsw i32 %.05, -1
+ %3 = getelementptr inbounds i32, i32* %.023, i64 1
+ %4 = load i32, i32* %.023, align 16
+ %5 = getelementptr inbounds i32, i32* %.014, i64 1
+ store i32 %4, i32* %.014, align 16
+ %6 = icmp eq i32 %2, 0
+ br i1 %6, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+; We can't vectorize this one because we need a runtime ptr check.
+;CHECK-LABEL: @example23(
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
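+; Roughly, in C:
+;   for (i = 0; i < 256; i++)
+;     *dst++ = *src++ << 7;   /* i16 loads, i32 stores */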
+define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+ %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+ %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
+ %2 = getelementptr inbounds i16, i16* %.04, i64 1
+ %3 = load i16, i16* %.04, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw nsw i32 %4, 7
+ %6 = getelementptr inbounds i32, i32* %.013, i64 1
+ store i32 %5, i32* %.013, align 4
+ %7 = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %7, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+
+; We CAN vectorize this example because the pointers are marked as noalias.
+define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23b(
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[NEXT_GEP]] to <4 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[TMP6:%.*]]
+; CHECK: br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7
+; CHECK: ret void
+;
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+ %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+ %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
+ %2 = getelementptr inbounds i16, i16* %.04, i64 1
+ %3 = load i16, i16* %.04, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw nsw i32 %4, 7
+ %6 = getelementptr inbounds i32, i32* %.013, i64 1
+ store i32 %5, i32* %.013, align 4
+ %7 = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %7, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+; We CAN vectorize this example by folding the tail it would otherwise require.
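+; This is the same loop as @example23b, but with a trip count of 257: the
+; vector trip count is rounded up to 260 and the extra iterations are masked
+; off, as the icmp ult against 257 in the CHECK lines shows.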
+define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23c(
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], <i64 257, i64 257, i64 257, i64 257>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK: pred.load.if:
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; CHECK: pred.load.continue:
+; CHECK-NEXT: [[TMP4:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
+; CHECK: pred.load.if11:
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]]
+; CHECK: pred.load.continue12:
+; CHECK-NEXT: [[TMP8:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]]
+; CHECK: pred.load.if13:
+; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]]
+; CHECK: pred.load.continue14:
+; CHECK-NEXT: [[TMP12:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]]
+; CHECK: pred.load.if15:
+; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]]
+; CHECK: pred.load.continue16:
+; CHECK-NEXT: [[TMP16:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP15]], [[PRED_LOAD_IF15]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK: pred.store.if17:
+; CHECK-NEXT: [[TMP21:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]]
+; CHECK-NEXT: store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]]
+; CHECK: pred.store.continue18:
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK: pred.store.if19:
+; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7
+; CHECK-NEXT: [[TMP27:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]]
+; CHECK-NEXT: store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; CHECK: pred.store.continue20:
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
+; CHECK: pred.store.if21:
+; CHECK-NEXT: [[TMP29:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT: [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]]
+; CHECK-NEXT: store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; CHECK: pred.store.continue22:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[TMP33:%.*]]
+; CHECK: br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
+; CHECK: ret void
+;
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+ %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+ %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+ %2 = getelementptr inbounds i16, i16* %.04, i64 1
+ %3 = load i16, i16* %.04, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw nsw i32 %4, 7
+ %6 = getelementptr inbounds i32, i32* %.013, i64 1
+ store i32 %5, i32* %.013, align 4
+ %7 = add nsw i64 %i.02, 1
+ %exitcond = icmp eq i64 %7, 257
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+; We CAN'T vectorize this example: it would require a tail, and an induction
+; variable is used outside the loop.
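+; The final value of the induction variable (%7) is returned, so folding the
+; tail would leave %7 holding the rounded-up trip count rather than 257.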
+define i64 @example23d(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+;CHECK-LABEL: @example23d(
+; CHECK-NOT: <4 x
+; CHECK: ret i64
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+ %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+ %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+ %2 = getelementptr inbounds i16, i16* %.04, i64 1
+ %3 = load i16, i16* %.04, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw nsw i32 %4, 7
+ %6 = getelementptr inbounds i32, i32* %.013, i64 1
+ store i32 %5, i32* %.013, align 4
+ %7 = add nsw i64 %i.02, 1
+ %exitcond = icmp eq i64 %7, 257
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret i64 %7
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
new file mode 100644
index 00000000000..645f3360543
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -0,0 +1,54 @@
+; This test checks that the given loop is still considered beneficial to
+; vectorize even though it contains a scalarized load (a gather on AVX2).
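+; Roughly, in C (a hypothetical reconstruction): data[i][k] is consecutive in
+; k, while data[k][j] is strided with a stride of 100 elements.
+;   int sum = 0;
+;   for (int k = 0; k < 100; k++)
+;     sum += data[i][k] * data[k][j] + 4;
+;   return sum;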
+;RUN: opt < %s -loop-vectorize -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+ %idxprom = sext i32 %i to i64
+ %idxprom5 = sext i32 %j to i64
+ br label %for.body
+
+ for.cond.cleanup: ; preds = %for.body
+ ret i32 %add7
+
+ for.body: ; preds = %for.body, %entry
+ ; The loop gets vectorized.
+ ; The first, consecutive load becomes a vector load:
+ ; CHECK: %wide.load = load <8 x i32>
+ ; The second, strided load is scalarized into eight scalar loads:
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+ ; CHECK: load i32
+
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx2, align 4, !tbaa !1
+ %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5
+ %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+ %mul = mul nsw i32 %1, %0
+ %add = add i32 %sum.015, 4
+ %add7 = add i32 %add, %mul
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/struct-store.ll b/llvm/test/Transforms/LoopVectorize/X86/struct-store.ll
new file mode 100644
index 00000000000..4ff3b0e4c05
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/struct-store.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S
+
+; Make sure we are not crashing on this one.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@glbl = external global [16 x { i64, i64 }], align 16
+
+declare void @fn()
+
+define void @test() {
+entry:
+ br label %loop
+
+loop:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ]
+ %tmp = getelementptr inbounds [16 x { i64, i64 }], [16 x { i64, i64 }]* @glbl, i64 0, i64 %indvars.iv
+ store { i64, i64 } { i64 ptrtoint (void ()* @fn to i64), i64 0 }, { i64, i64 }* %tmp, align 16
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 16
+ br i1 %exitcond, label %loop, label %exit
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
new file mode 100644
index 00000000000..5a4bfe5e6bd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll
@@ -0,0 +1,187 @@
+; RUN: opt -vector-library=SVML -loop-vectorize -S < %s | FileCheck %s
+
+; Test to verify that when math headers are built with __FINITE_MATH_ONLY__
+; enabled (causing the __<func>_finite function versions to be used),
+; vectorization can map these calls to their SVML vector versions.
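+; For example, under __FINITE_MATH_ONLY__ a source-level call such as
+;   varray[i] = expf((float)i);
+; is emitted as a call to @__expf_finite, which should be vectorized into a
+; call to <4 x float> @__svml_expf4 (per the CHECK lines below).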
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare float @__expf_finite(float) #0
+
+; CHECK-LABEL: @exp_f32
+; CHECK: <4 x float> @__svml_expf4
+; CHECK: ret
+define void @exp_f32(float* nocapture %varray) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__expf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+ store float %call, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare double @__exp_finite(double) #0
+
+; CHECK-LABEL: @exp_f64
+; CHECK: <4 x double> @__svml_exp4
+; CHECK: ret
+define void @exp_f64(double* nocapture %varray) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call fast double @__exp_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+ store double %call, double* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.vectorize.width", i32 4}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+
+
+declare float @__logf_finite(float) #0
+
+; CHECK-LABEL: @log_f32
+; CHECK: <4 x float> @__svml_logf4
+; CHECK: ret
+define void @log_f32(float* nocapture %varray) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call fast float @__logf_finite(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+ store float %call, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!21 = distinct !{!21, !22, !23}
+!22 = !{!"llvm.loop.vectorize.width", i32 4}
+!23 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare double @__log_finite(double) #0
+
+; CHECK-LABEL: @log_f64
+; CHECK: <4 x double> @__svml_log4
+; CHECK: ret
+define void @log_f64(double* nocapture %varray) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call fast double @__log_finite(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+ store double %call, double* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!31 = distinct !{!31, !32, !33}
+!32 = !{!"llvm.loop.vectorize.width", i32 4}
+!33 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare float @__powf_finite(float, float) #0
+
+; CHECK-LABEL: @pow_f32
+; CHECK: <4 x float> @__svml_powf4
+; CHECK: ret
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to float
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %indvars.iv
+ %tmp1 = load float, float* %arrayidx, align 4
+ %tmp2 = tail call fast float @__powf_finite(float %conv, float %tmp1)
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %indvars.iv
+ store float %tmp2, float* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!41 = distinct !{!41, !42, !43}
+!42 = !{!"llvm.loop.vectorize.width", i32 4}
+!43 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+declare double @__pow_finite(double, double) #0
+
+; CHECK-LABEL: @pow_f64
+; CHECK: <4 x double> @__svml_pow4
+; CHECK: ret
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = trunc i64 %indvars.iv to i32
+ %conv = sitofp i32 %tmp to double
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %indvars.iv
+ %tmp1 = load double, double* %arrayidx, align 4
+ %tmp2 = tail call fast double @__pow_finite(double %conv, double %tmp1)
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %indvars.iv
+ store double %tmp2, double* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !51
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!51 = distinct !{!51, !52, !53}
+!52 = !{!"llvm.loop.vectorize.width", i32 4}
+!53 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
new file mode 100644
index 00000000000..8ff62f178d0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll
@@ -0,0 +1,501 @@
+; RUN: opt -vector-library=SVML -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare double @sin(double) #0
+declare float @sinf(float) #0
+declare double @llvm.sin.f64(double) #0
+declare float @llvm.sin.f32(float) #0
+
+declare double @cos(double) #0
+declare float @cosf(float) #0
+declare double @llvm.cos.f64(double) #0
+declare float @llvm.cos.f32(float) #0
+
+declare double @pow(double, double) #0
+declare float @powf(float, float) #0
+declare double @llvm.pow.f64(double, double) #0
+declare float @llvm.pow.f32(float, float) #0
+
+declare double @exp(double) #0
+declare float @expf(float) #0
+declare double @llvm.exp.f64(double) #0
+declare float @llvm.exp.f32(float) #0
+
+declare double @log(double) #0
+declare float @logf(float) #0
+declare double @llvm.log.f64(double) #0
+declare float @llvm.log.f32(float) #0
+
+
+define void @sin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @sin(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @sinf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.sin.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @sin_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.sin.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @cos(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @cosf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.cos.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @cos_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.cos.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64(
+; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+ %tmp1 = load double, double* %arrayidx, align 4
+ %tmp2 = tail call double @pow(double %conv, double %tmp1)
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %tmp2, double* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64_intrinsic(
+; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv
+ %tmp1 = load double, double* %arrayidx, align 4
+ %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1)
+ %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %tmp2, double* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32(
+; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+ %tmp1 = load float, float* %arrayidx, align 4
+ %tmp2 = tail call float @powf(float %conv, float %tmp1)
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %tmp2, float* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32_intrinsic(
+; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv
+ %tmp1 = load float, float* %arrayidx, align 4
+ %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1)
+ %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %tmp2, float* %arrayidx2, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @exp(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @expf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.exp.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @exp_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.exp.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @log(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @logf(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to double
+ %call = tail call double @llvm.log.f64(double %conv)
+ %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv
+ store double %call, double* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+define void @log_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %tmp = trunc i64 %iv to i32
+ %conv = sitofp i32 %tmp to float
+ %call = tail call float @llvm.log.f32(float %conv)
+ %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv
+ store float %call, float* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll
new file mode 100644
index 00000000000..c0bbb92c2c5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -mcpu=prescott < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd11.0"
+
+@big = external global [0 x i32]
+
+; PR18049
+; We need to truncate the exit count to i32. This is legal because the
+; arithmetic is signed (%inc is nsw).
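+;
+; A rough C equivalent, reconstructed from the IR for illustration (the array
+; name matches @big; everything else is made up):
+;
+;   extern int big[];
+;   void tripcount(long long count) {
+;     for (int i = 0; i < count; i++)
+;       big[i] = ~big[i];
+;   }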
+
+; CHECK-LABEL: tripcount
+; CHECK: trunc i64 %count to i32
+
+define void @tripcount(i64 %count) {
+entry:
+ %cmp6 = icmp sgt i64 %count, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @big, i32 0, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %neg = xor i32 %0, -1
+ store i32 %neg, i32* %arrayidx, align 4
+ %inc = add nsw i32 %i.07, 1
+ %conv = sext i32 %inc to i64
+ %cmp = icmp slt i64 %conv, %count
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
new file mode 100644
index 00000000000..e08ef002d0e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+
+; CHECK: cost of 4 for VF 1 For instruction: %conv = uitofp i64 %tmp to double
+; CHECK: cost of 5 for VF 2 For instruction: %conv = uitofp i64 %tmp to double
+; CHECK: cost of 6 for VF 4 For instruction: %conv = uitofp i64 %tmp to double
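+;
+; There is no packed u64-to-f64 conversion on AVX (it only arrives with
+; AVX-512DQ), so the uitofp has to be expanded, and its estimated cost grows
+; with the vectorization factor.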
+define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
+entry:
+ br label %for.body
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv
+ %tmp = load i64, i64* %arrayidx, align 4
+ %conv = uitofp i64 %tmp to double
+ %arrayidx2 = getelementptr inbounds double, double* %b, i64 %indvars.iv
+ store double %conv, double* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll
new file mode 100644
index 00000000000..2be565e7110
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll
@@ -0,0 +1,99 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: test
+; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599
+
+define void @test(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %tmp0 = load float, float* %arrayidx, align 4
+ %add = fadd float %tmp0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ store float %add, float* %arrayidx5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv, 1599
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; CHECK-LABEL: foo
+; CHECK-DAG: LV: Found uniform instruction: %cond = icmp eq i64 %i.next, %n
+; CHECK-DAG: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0
+; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
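+;
+; The induction phi %i itself is not uniform here: its truncated value %tmp0
+; is stored to memory, and that use differs per lane.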
+
+define void @foo(i32* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = trunc i64 %i to i32
+ %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0
+ store i32 %tmp0, i32* %tmp1, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: goo
+; Check that %indvars.iv and %indvars.iv.next are uniform instructions even if
+; they are used outside of the loop.
+; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599
+
+define i64 @goo(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %tmp0 = load float, float* %arrayidx, align 4
+ %add = fadd float %tmp0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ store float %add, float* %arrayidx5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv, 1599
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %retval = add i64 %indvars.iv, %indvars.iv.next
+ ret i64 %retval
+}
+
+; CHECK-LABEL: PR38786
+; Check that first-order recurrence phis (%phi32 and %phi64) are not uniform.
+; CHECK-NOT: LV: Found uniform instruction: %phi
+define void @PR38786(double* %y, double* %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %phi32 = phi i32 [ 0, %entry ], [ %i32next, %for.body ]
+ %phi64 = phi i64 [ 0, %entry ], [ %i64next, %for.body ]
+ %i32next = add i32 %phi32, 1
+ %i64next = zext i32 %i32next to i64
+ %xip = getelementptr inbounds double, double* %x, i64 %i64next
+ %yip = getelementptr inbounds double, double* %y, i64 %phi64
+ %xi = load double, double* %xip, align 8
+ store double %xi, double* %yip, align 8
+ %cmp = icmp slt i64 %i64next, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll
new file mode 100644
index 00000000000..e71292265c2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll
@@ -0,0 +1,47 @@
+; RUN: opt -basicaa -loop-vectorize -S -mcpu=core-avx2 < %s | FileCheck %s
+
+; float inc = 0.5;
+; void foo(float *A, unsigned N) {
+;   for (unsigned i = 0; i < N; i++) {
+;     A[i] += inc;
+;   }
+; }
+
+; CHECK-LABEL: foo
+; CHECK: vector.body
+; CHECK: load <8 x float>
+; CHECK: fadd <8 x float>
+; CHECK: store <8 x float>
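+;
+; The load of the global @inc inside the loop is uniform across iterations;
+; the point of the test is that the loop is nevertheless vectorized, with A
+; accessed as <8 x float>.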
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@inc = global float 5.000000e-01, align 4
+
+define void @foo(float* nocapture %A, i32 %N) #0 {
+entry:
+ %cmp3 = icmp eq i32 %N, 0
+ br i1 %cmp3, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %0 = load float, float* @inc, align 4
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %1 = load float, float* %arrayidx, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
new file mode 100644
index 00000000000..dc23ea8efdf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll
@@ -0,0 +1,23 @@
+; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: "foo"
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %shift = ashr i32 %val, %k
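+;
+; Rough C equivalent, reconstructed from the IR for illustration:
+;
+;   void foo(int *p, int k) {
+;     for (long i = 0; i < 16; ++i)
+;       p[i] >>= k;   // %k is loop-invariant, so the shift amount is uniform
+;   }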
+define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 {
+entry:
+ br label %body
+
+body:
+ %i = phi i64 [ 0, %entry ], [ %next, %body ]
+ %ptr = getelementptr inbounds i32, i32* %p, i64 %i
+ %val = load i32, i32* %ptr, align 4
+ %shift = ashr i32 %val, %k
+ store i32 %shift, i32* %ptr, align 4
+ %next = add nuw nsw i64 %i, 1
+ %cmp = icmp eq i64 %next, 16
+ br i1 %cmp, label %exit, label %body
+
+exit:
+ ret void
+
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
new file mode 100644
index 00000000000..52914b6a7c6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -disable-loop-unrolling -S | FileCheck %s -check-prefix=CHECK-NOUNRL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK-LABEL: @bar(
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+;CHECK-NOUNRL-LABEL: @bar(
+;CHECK-NOUNRL: store <4 x i32>
+;CHECK-NOUNRL-NOT: store <4 x i32>
+;CHECK-NOUNRL: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, 6
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
new file mode 100644
index 00000000000..69d2a319a8c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -dce -S \
+; RUN: | FileCheck %s --check-prefix=CHECK-VECTOR
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-interleave=0 -dce -S \
+; RUN: | FileCheck %s --check-prefix=CHECK-SCALAR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; We don't unroll this loop because it has a small constant trip count.
+;
+; CHECK-VECTOR-LABEL: @foo(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo(
+; CHECK-SCALAR: load i32, i32*
+; CHECK-SCALAR-NOT: load i32, i32*
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
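+;
+; In C terms the loop is roughly (reconstructed for illustration):
+;
+;   for (int i = 0; i < 100; i++)
+;     A[i] += 6;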
+define i32 @foo(i32* nocapture %A) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, 6
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 100
+ br i1 %exitcond, label %5, label %1
+
+; <label>:5 ; preds = %1
+ ret i32 undef
+}
+
+; But this is a good small loop to unroll as we don't know of a bound on its
+; trip count.
+;
+; CHECK-VECTOR-LABEL: @bar(
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; For x86, loop unrolling in the loop vectorizer is disabled when VF==1.
+;
+; CHECK-SCALAR-LABEL: @bar(
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, 6
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+; Also unroll if we need a runtime check, since one was going to be added for
+; vectorization anyway.
+; CHECK-VECTOR-LABEL: @runtime_chk(
+; CHECK-VECTOR: store <4 x float>
+; CHECK-VECTOR: store <4 x float>
+;
+; But not if the unrolling would introduce the runtime check.
+; CHECK-SCALAR-LABEL: @runtime_chk(
+; CHECK-SCALAR: store float
+; CHECK-SCALAR-NOT: store float
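+;
+; In C terms the loop is roughly (reconstructed for illustration):
+;
+;   for (long i = 0; i < 256; i++)
+;     A[i] = B[i] * N;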
+define void @runtime_chk(float* %A, float* %B, float %N) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %mul = fmul float %0, %N
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %mul, float* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll_selection.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll_selection.ll
new file mode 100644
index 00000000000..71b829071e2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll_selection.ll
@@ -0,0 +1,71 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -dce -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Don't unroll when we have register pressure.
+;CHECK: reg_pressure
+;CHECK: load <4 x double>
+;CHECK-NOT: load <4 x double>
+;CHECK: store <4 x double>
+;CHECK-NOT: store <4 x double>
+;CHECK: ret
+define void @reg_pressure(double* nocapture %A, i32 %n) nounwind uwtable ssp {
+ %1 = sext i32 %n to i64
+ br label %2
+
+; <label>:2 ; preds = %2, %0
+ %indvars.iv = phi i64 [ %indvars.iv.next, %2 ], [ %1, %0 ]
+ %3 = getelementptr inbounds double, double* %A, i64 %indvars.iv
+ %4 = load double, double* %3, align 8
+ %5 = fadd double %4, 3.000000e+00
+ %6 = fmul double %4, 2.000000e+00
+ %7 = fadd double %5, %6
+ %8 = fadd double %7, 2.000000e+00
+ %9 = fmul double %8, 5.000000e-01
+ %10 = fadd double %6, %9
+ %11 = fsub double %10, %5
+ %12 = fadd double %4, %11
+ %13 = fdiv double %8, %12
+ %14 = fmul double %13, %8
+ %15 = fmul double %6, %14
+ %16 = fmul double %5, %15
+ %17 = fadd double %16, -3.000000e+00
+ %18 = fsub double %4, %5
+ %19 = fadd double %6, %18
+ %20 = fadd double %13, %19
+ %21 = fadd double %20, %17
+ %22 = fadd double %21, 3.000000e+00
+ %23 = fmul double %4, %22
+ store double %23, double* %3, align 8
+ %indvars.iv.next = add i64 %indvars.iv, -1
+ %24 = trunc i64 %indvars.iv to i32
+ %25 = icmp eq i32 %24, 0
+ br i1 %25, label %26, label %2
+
+; <label>:26 ; preds = %2
+ ret void
+}
+
+; This is a small loop. Unroll it twice.
+;CHECK: small_loop
+;CHECK: xor
+;CHECK: xor
+;CHECK: ret
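+;
+; In C terms the loop is roughly (reconstructed for illustration; A is a
+; short*):
+;
+;   for (unsigned long i = 0; i < n; i++)
+;     A[i] ^= 3;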
+define void @small_loop(i16* nocapture %A, i64 %n) nounwind uwtable ssp {
+ %1 = icmp eq i64 %n, 0
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %i.01 = phi i64 [ %5, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i16, i16* %A, i64 %i.01
+ %3 = load i16, i16* %2, align 2
+ %4 = xor i16 %3, 3
+ store i16 %4, i16* %2, align 2
+ %5 = add i64 %i.01, 1
+ %exitcond = icmp eq i64 %5, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll
new file mode 100644
index 00000000000..6f8f5223ce4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll
@@ -0,0 +1,632 @@
+; RUN: opt < %s -vector-library=Accelerate -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
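+
+; With -vector-library=Accelerate, the scalar libm calls below (sqrtf, expf,
+; logf, ...) are mapped to the corresponding Accelerate vector routines
+; (vsqrtf, vexpf, vlogf, ...), which the vectorizer then calls on <4 x float>.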
+
+;CHECK-LABEL: @sqrt_f32(
+;CHECK: vsqrtf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @sqrtf(float) nounwind readnone
+define void @sqrt_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @sqrtf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @exp_f32(
+;CHECK: vexpf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @expf(float) nounwind readnone
+define void @exp_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @expf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @log_f32(
+;CHECK: vlogf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @logf(float) nounwind readnone
+define void @log_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @logf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; For the fabs call we generate the vector intrinsic instead, as it is cheaper
+; than a library call.
+;CHECK-LABEL: @fabs_f32(
+;CHECK: fabs{{.*}}<4 x float>
+;CHECK: ret void
+declare float @fabsf(float) nounwind readnone
+define void @fabs_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @fabsf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Test that we can vectorize an intrinsic into a vector call.
+;CHECK-LABEL: @exp_f32_intrin(
+;CHECK: vexpf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @llvm.exp.f32(float) nounwind readnone
+define void @exp_f32_intrin(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.exp.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Test that we don't vectorize arbitrary functions.
+;CHECK-LABEL: @foo_f32(
+;CHECK-NOT: foo{{.*}}<4 x float>
+;CHECK: ret void
+declare float @foo(float) nounwind readnone
+define void @foo_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @foo(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Test that we don't vectorize calls with the nobuiltin attribute.
+;CHECK-LABEL: @sqrt_f32_nobuiltin(
+;CHECK-NOT: vsqrtf{{.*}}<4 x float>
+;CHECK: ret void
+define void @sqrt_f32_nobuiltin(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @sqrtf(float %0) nounwind readnone nobuiltin
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @ceil_f32(
+;CHECK: vceilf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @ceilf(float) nounwind readnone
+define void @ceil_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @ceilf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @floor_f32(
+;CHECK: vfloorf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @floorf(float) nounwind readnone
+define void @floor_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @floorf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @expm1_f32(
+;CHECK: vexpm1f{{.*}}<4 x float>
+;CHECK: ret void
+declare float @expm1f(float) nounwind readnone
+define void @expm1_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @expm1f(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @log1p_f32(
+;CHECK: vlog1pf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @log1pf(float) nounwind readnone
+define void @log1p_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @log1pf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @log10_f32(
+;CHECK: vlog10f{{.*}}<4 x float>
+;CHECK: ret void
+declare float @log10f(float) nounwind readnone
+define void @log10_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @log10f(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @logb_f32(
+;CHECK: vlogbf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @logbf(float) nounwind readnone
+define void @logb_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @logbf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @sin_f32(
+;CHECK: vsinf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @sinf(float) nounwind readnone
+define void @sin_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @sinf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @cos_f32(
+;CHECK: vcosf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @cosf(float) nounwind readnone
+define void @cos_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @cosf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @tan_f32(
+;CHECK: vtanf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @tanf(float) nounwind readnone
+define void @tan_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @tanf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @asin_f32(
+;CHECK: vasinf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @asinf(float) nounwind readnone
+define void @asin_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @asinf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @acos_f32(
+;CHECK: vacosf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @acosf(float) nounwind readnone
+define void @acos_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @acosf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @atan_f32(
+;CHECK: vatanf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @atanf(float) nounwind readnone
+define void @atan_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @atanf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @sinh_f32(
+;CHECK: vsinhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @sinhf(float) nounwind readnone
+define void @sinh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @sinhf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @cosh_f32(
+;CHECK: vcoshf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @coshf(float) nounwind readnone
+define void @cosh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @coshf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @tanh_f32(
+;CHECK: vtanhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @tanhf(float) nounwind readnone
+define void @tanh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @tanhf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @asinh_f32(
+;CHECK: vasinhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @asinhf(float) nounwind readnone
+define void @asinh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @asinhf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @acosh_f32(
+;CHECK: vacoshf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @acoshf(float) nounwind readnone
+define void @acosh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @acoshf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @atanh_f32(
+;CHECK: vatanhf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @atanhf(float) nounwind readnone
+define void @atanh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @atanhf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
new file mode 100644
index 00000000000..90114672c8f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
@@ -0,0 +1,87 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: LV: Loop hints: force=enabled
+; CHECK: LV: Loop hints: force=?
+; No more loops in the module
+; CHECK-NOT: LV: Loop hints: force=
+; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
+; CHECK: 1 loop-vectorize - Number of loops vectorized
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;
+; The source code for the test:
+;
+; #include <math.h>
+; void foo(float* restrict A, float* restrict B) {
+;   for (int i = 0; i < 1000; i += 2)
+;     A[i] = sinf(B[i]);
+; }
+;
+
+;
+; This loop will be vectorized even though the scalar cost is lower than any of
+; the vector costs, because vectorization is explicitly forced in the metadata.
+;
+
+define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11
+ %call = tail call float @llvm.sin.f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4, !llvm.access.group !11
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1000
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+!1 = !{!1, !2, !{!"llvm.loop.parallel_accesses", !11}}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!11 = distinct !{}
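+
+; !1 is the loop ID, !2 forces vectorization via llvm.loop.vectorize.enable,
+; and the llvm.loop.parallel_accesses entry marks the !11-tagged accesses as
+; safe to execute in parallel.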
+
+;
+; This loop will not be vectorized, as the scalar cost is lower than any of the
+; vector costs and vectorization is not forced here.
+;
+
+define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
+ %call = tail call float @llvm.sin.f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4, !llvm.access.group !13
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1000
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+declare float @llvm.sin.f32(float) nounwind readnone
+
+; Dummy metadata
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}}
+!13 = distinct !{}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
new file mode 100644
index 00000000000..7c249a1b422
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -0,0 +1,217 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S -vectorizer-min-trip-count=21 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+;
+; The source code for the test:
+;
+; void foo(float* restrict A, float* restrict B) {
+;   for (int i = 0; i < 20; ++i)
+;     A[i] += B[i];
+; }
+;
+
+;
+; This loop will be vectorized even though its trip count (20) is below the
+; -vectorizer-min-trip-count=21 threshold, because vectorization is explicitly
+; forced in the metadata.
+;
+define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !4
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !11
+ %add = fadd fast float %0, %1
+ store float %add, float* %arrayidx2, align 4, !llvm.access.group !11
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 20
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret void
+}
+
+!1 = !{!1, !2, !{!"llvm.loop.parallel_accesses", !11}}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+!11 = distinct !{}
+
+;
+; This loop will be vectorized even though the trip count is below the
+; threshold, because folding its tail means no scalar iterations are needed.
+;
+define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13
+ %add = fadd fast float %0, %1
+ store float %add, float* %arrayidx2, align 4, !llvm.access.group !13
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 20
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
+
+for.end:
+ ret void
+}
+
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}}
+!13 = distinct !{}
+
+;
+; This loop will be vectorized even though the trip count (16) is below the
+; threshold, because 16 is a multiple of the VF of 8 and so no scalar
+; iterations are needed.
+;
+define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
+; CHECK-LABEL: @vectorized2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
+; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
+; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13
+ %add = fadd fast float %0, %1
+ store float %add, float* %arrayidx2, align 4, !llvm.access.group !13
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 16
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+ ret void
+}
+
+!4 = !{!4}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
new file mode 100644
index 00000000000..5b3f5c58c79
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -loop-vectorize -mattr=+sse4.2 -debug-only=loop-vectorize 2>&1 -S | FileCheck %s
+; REQUIRES: asserts
+; Make sure we use the right select kind when querying select costs.
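+;
+; In @scalarselect the condition %cond is loop-invariant, so the vectorized
+; select keeps a single scalar i1 condition; in @vectorselect the condition
+; is computed per iteration from the induction variable, so it becomes a
+; vector of i1. Roughly (hypothetical C):
+;   a[i] = cond    ? b[i] + c[i] : 0;  /* scalar (uniform) select condition */
+;   a[i] = (i < 8) ? b[i] + c[i] : 0;  /* per-lane (vector) select condition */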
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
+; CHECK: Checking a loop in "scalarselect"
+define void @scalarselect(i1 %cond) {
+ br label %1
+
+; <label>:1
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+
+; A scalar select has a cost of 1 on core2
+; CHECK: cost of 1 for VF 2 {{.*}} select i1 %cond, i32 %6, i32 0
+
+ %sel = select i1 %cond, i32 %6, i32 zeroinitializer
+ store i32 %sel, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8
+ ret void
+}
+
+; CHECK: Checking a loop in "vectorselect"
+define void @vectorselect(i1 %cond) {
+ br label %1
+
+; <label>:1
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %8 = icmp ult i64 %indvars.iv, 8
+
+; A vector select has a cost of 1 on core2
+; CHECK: cost of 1 for VF 2 {{.*}} select i1 %8, i32 %6, i32 0
+
+ %sel = select i1 %8, i32 %6, i32 zeroinitializer
+ store i32 %sel, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %9, label %1
+
+; <label>:9
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
new file mode 100644
index 00000000000..2d2082eec3a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
@@ -0,0 +1,75 @@
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX1
+; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=core-avx2 -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX2
+; REQUIRES: asserts
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = global [1000 x i8] zeroinitializer, align 16
+@b = global [1000 x i8] zeroinitializer, align 16
+@c = global [1000 x i8] zeroinitializer, align 16
+@u = global [1000 x i32] zeroinitializer, align 16
+@v = global [1000 x i32] zeroinitializer, align 16
+@w = global [1000 x i32] zeroinitializer, align 16
+
+; Tests that the vectorization factor is determined by the smallest type in
+; the loop rather than the widest one, in order to maximize bandwidth when
+; -vectorizer-maximize-bandwidth is enabled.
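+;
+; Roughly (hypothetical C), the loop body mixes i8 and i32 operations:
+;
+;   for (int i = 0; i < 1000; i++) {
+;     a[i] = b[i] + c[i];   /* i8 elements  */
+;     u[i] = v[i] + w[i];   /* i32 elements */
+;   }
+;
+; With -vectorizer-maximize-bandwidth the VF is chosen so the i8 operations
+; fill a full vector register: 16 x i8 under AVX1 (which lacks 256-bit
+; integer ops) and 32 x i8 under AVX2.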
+;
+; CHECK-LABEL: foo
+; CHECK-AVX1: LV: Selecting VF: 16.
+; CHECK-AVX2: LV: Selecting VF: 32.
+define void @foo() {
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1
+ %add = add i8 %1, %0
+ %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv
+ store i8 %add, i8* %arrayidx6, align 1
+ %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv
+ %2 = load i32, i32* %arrayidx8, align 4
+ %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %arrayidx10, align 4
+ %add11 = add nsw i32 %3, %2
+ %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv
+ store i32 %add11, i32* %arrayidx13, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; We should not choose a VF larger than the constant trip count.
+; The chosen VF should be at most 16, not the maximum possible vector width
+; (32 for AVX2).
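+; Roughly (hypothetical C): for (int i = 0; i < 16; i++) A[i] += B[i];
+; with i8 elements, so a VF of 16 already covers the whole trip count.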
+define void @not_too_small_tc(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
+; CHECK-LABEL: not_too_small_tc
+; CHECK-AVX2: LV: Selecting VF: 16.
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv
+ %l1 = load i8, i8* %arrayidx, align 4, !llvm.access.group !13
+ %arrayidx2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+ %l2 = load i8, i8* %arrayidx2, align 4, !llvm.access.group !13
+ %add = add i8 %l1, %l2
+ store i8 %add, i8* %arrayidx2, align 4, !llvm.access.group !13
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 16
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+ ret void
+}
+!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}}
+!4 = !{!4}
+!13 = distinct !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
new file mode 100644
index 00000000000..cca829b9457
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll
@@ -0,0 +1,150 @@
+; RUN: opt -basicaa -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%0 = type { %0*, %1 }
+%1 = type { i8*, i32 }
+
+@p = global [2048 x [8 x i32*]] zeroinitializer, align 16
+@q = global [2048 x i16] zeroinitializer, align 16
+@r = global [2048 x i16] zeroinitializer, align 16
+
+; Tests for the widest-type computation.
+; Ensure that we count the pointer store in the first test case. The store
+; writes a vector of pointers to consecutive memory, so it must count towards
+; the widest vector type.
+;
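+; Roughly (hypothetical C), the first loop stores one pointer value over a
+; range of consecutive slots:
+;   while (begin != end) *begin++ = v;
+;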
+; CHECK: test_consecutive_store
+; CHECK: The Smallest and Widest types: 64 / 64 bits.
+define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 {
+ %4 = load %0*, %0** %2, align 8
+ %5 = icmp eq %0** %0, %1
+ br i1 %5, label %12, label %6
+
+; <label>:6 ; preds = %3
+ br label %7
+
+; <label>:7 ; preds = %7, %6
+ %8 = phi %0** [ %0, %6 ], [ %9, %7 ]
+ store %0* %4, %0** %8, align 8
+ %9 = getelementptr inbounds %0*, %0** %8, i64 1
+ %10 = icmp eq %0** %9, %1
+ br i1 %10, label %11, label %7
+
+; <label>:11 ; preds = %7
+ br label %12
+
+; <label>:12 ; preds = %11, %3
+ ret void
+}
+
+; However, if a set of pointers is stored to non-consecutive memory, we do
+; NOT count the store towards the widest vector type.
+; In the test case below, i16 values are converted to pointers and stored
+; into an array of pointers non-consecutively, so the widest type should
+; be i16.
+; int* p[2048][8];
+; short q[2048];
+; for (int y = 0; y < 8; ++y)
+; for (int i = 0; i < 1024; ++i) {
+; p[i][y] = (int*) (1 + q[i]);
+; }
+; CHECK: test_nonconsecutive_store
+; CHECK: The Smallest and Widest types: 16 / 16 bits.
+define void @test_nonconsecutive_store() nounwind ssp uwtable {
+ br label %1
+
+; <label>:1 ; preds = %14, %0
+ %2 = phi i64 [ 0, %0 ], [ %15, %14 ]
+ br label %3
+
+; <label>:3 ; preds = %3, %1
+ %4 = phi i64 [ 0, %1 ], [ %11, %3 ]
+ %5 = getelementptr inbounds [2048 x i16], [2048 x i16]* @q, i64 0, i64 %4
+ %6 = load i16, i16* %5, align 2
+ %7 = sext i16 %6 to i64
+ %8 = add i64 %7, 1
+ %9 = inttoptr i64 %8 to i32*
+ %10 = getelementptr inbounds [2048 x [8 x i32*]], [2048 x [8 x i32*]]* @p, i64 0, i64 %4, i64 %2
+ store i32* %9, i32** %10, align 8
+ %11 = add i64 %4, 1
+ %12 = trunc i64 %11 to i32
+ %13 = icmp ne i32 %12, 1024
+ br i1 %13, label %3, label %14
+
+; <label>:14 ; preds = %3
+ %15 = add i64 %2, 1
+ %16 = trunc i64 %15 to i32
+ %17 = icmp ne i32 %16, 8
+ br i1 %17, label %1, label %18
+
+; <label>:18 ; preds = %14
+ ret void
+}
+
+
+@ia = global [1024 x i32*] zeroinitializer, align 16
+@ib = global [1024 x i32] zeroinitializer, align 16
+@ic = global [1024 x i8] zeroinitializer, align 16
+@p2 = global [2048 x [8 x i32*]] zeroinitializer, align 16
+@q2 = global [2048 x i16] zeroinitializer, align 16
+
+;; Now we check the same rules for loads. We should take consecutive loads of
+;; pointer types into account.
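+;; Roughly (hypothetical C): sum += (char)(intptr_t)ia[i]; the loads of the
+;; i32* elements of @ia are consecutive, so the widest type is 64 bits while
+;; the smallest (the i8 accumulator) is 8 bits.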
+; CHECK: test_consecutive_ptr_load
+; CHECK: The Smallest and Widest types: 8 / 64 bits.
+define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %2 = phi i64 [ 0, %0 ], [ %10, %1 ]
+ %3 = phi i8 [ 0, %0 ], [ %9, %1 ]
+ %4 = getelementptr inbounds [1024 x i32*], [1024 x i32*]* @ia, i32 0, i64 %2
+ %5 = load i32*, i32** %4, align 4
+ %6 = ptrtoint i32* %5 to i64
+ %7 = trunc i64 %6 to i8
+ %8 = add i8 %3, 1
+ %9 = add i8 %7, %8
+ %10 = add i64 %2, 1
+ %11 = icmp ne i64 %10, 1024
+ br i1 %11, label %1, label %12
+
+; <label>:12 ; preds = %1
+ %13 = phi i8 [ %9, %1 ]
+ ret i8 %13
+}
+
+;; However, we should not take non-consecutive loads of pointers into account.
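+;; Roughly (hypothetical C): q2[i] = (short)(intptr_t)p2[i][y]; the pointer
+;; loads stride over the rows of @p2 and are not consecutive, so only the
+;; i16 accesses count and both bounds are 16 bits.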
+; CHECK: test_nonconsecutive_ptr_load
+; CHECK: LV: The Smallest and Widest types: 16 / 16 bits.
+define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable {
+ br label %1
+
+; <label>:1 ; preds = %13, %0
+ %2 = phi i64 [ 0, %0 ], [ %14, %13 ]
+ br label %3
+
+; <label>:3 ; preds = %3, %1
+ %4 = phi i64 [ 0, %1 ], [ %10, %3 ]
+ %5 = getelementptr inbounds [2048 x [8 x i32*]], [2048 x [8 x i32*]]* @p2, i64 0, i64 %4, i64 %2
+ %6 = getelementptr inbounds [2048 x i16], [2048 x i16]* @q2, i64 0, i64 %4
+ %7 = load i32*, i32** %5, align 2
+ %8 = ptrtoint i32* %7 to i64
+ %9 = trunc i64 %8 to i16
+ store i16 %9, i16* %6, align 8
+ %10 = add i64 %4, 1
+ %11 = trunc i64 %10 to i32
+ %12 = icmp ne i32 %11, 1024
+ br i1 %12, label %3, label %13
+
+; <label>:13 ; preds = %3
+ %14 = add i64 %2, 1
+ %15 = trunc i64 %14 to i32
+ %16 = icmp ne i32 %15, 8
+ br i1 %16, label %1, label %17
+
+; <label>:17 ; preds = %13
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
new file mode 100644
index 00000000000..c38b638ed2b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll
@@ -0,0 +1,74 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
+
+; RUN: llc < %s -mtriple x86_64-pc-linux-gnu -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s
+; DEBUG-OUTPUT-NOT: .loc
+; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
+
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
+; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized
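+; The IR below roughly corresponds to (hypothetical C, reconstructed from the
+; remark locations in vectorization-remarks.c):
+;
+;   int foo(int n) {
+;     int diff = 0;
+;     char cb[16], cc[16];
+;     for (int i = 0; i < 16; i++)    // line 17: the loop the remarks cite
+;       diff += cb[i] - cc[i];
+;     ibar(&diff);
+;     return 0;
+;   }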
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @foo(i32 %n) #0 !dbg !4 {
+entry:
+ %diff = alloca i32, align 4
+ %cb = alloca [16 x i8], align 16
+ %cc = alloca [16 x i8], align 16
+ store i32 0, i32* %diff, align 4, !tbaa !11
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* %cb, i64 0, i64 %indvars.iv
+ %0 = load i8, i8* %arrayidx, align 1, !tbaa !21
+ %conv = sext i8 %0 to i32
+ %arrayidx2 = getelementptr inbounds [16 x i8], [16 x i8]* %cc, i64 0, i64 %indvars.iv
+ %1 = load i8, i8* %arrayidx2, align 1, !tbaa !21
+ %conv3 = sext i8 %1 to i32
+ %sub = sub i32 %conv, %conv3
+ %add = add nsw i32 %sub, %add8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 16
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !25
+
+for.end: ; preds = %for.body
+ store i32 %add, i32* %diff, align 4, !tbaa !11
+ call void @ibar(i32* %diff) #2
+ ret i32 0
+}
+
+declare void @ibar(i32*) #1
+
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+!llvm.dbg.cu = !{!24}
+
+!1 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !24, scopeLine: 6, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0 "}
+!10 = !DILocation(line: 8, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 17, column: 8, scope: !16)
+!16 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !17)
+!17 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !18)
+!18 = distinct !DILexicalBlock(line: 17, column: 3, file: !1, scope: !4)
+!19 = !DILocation(line: 18, column: 5, scope: !20)
+!20 = distinct !DILexicalBlock(line: 17, column: 27, file: !1, scope: !18)
+!21 = !{!13, !13, i64 0}
+!22 = !DILocation(line: 20, column: 3, scope: !4)
+!23 = !DILocation(line: 21, column: 3, scope: !4)
+!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, emissionKind: NoDebug)
+!25 = !{!25, !15}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
new file mode 100644
index 00000000000..4aa96df94ff
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -0,0 +1,313 @@
+; RUN: opt < %s -loop-vectorize -transform-warning -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -transform-warning -o /dev/null -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+; RUN: opt < %s -passes=loop-vectorize,transform-warning -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize,transform-warning -o /dev/null -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
+
+; C/C++ code for tests
+; void test(int *A, int Length) {
+; #pragma clang loop vectorize(enable) interleave(enable)
+; for (int i = 0; i < Length; i++) {
+; A[i] = i;
+; if (A[i] > Length)
+; break;
+; }
+; }
+; File, line, and column should match those specified in the metadata
+; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations
+; CHECK: remark: source.cpp:4:5: loop not vectorized
+
+; void test_disabled(int *A, int Length) {
+; #pragma clang loop vectorize(disable) interleave(disable)
+; for (int i = 0; i < Length; i++)
+; A[i] = i;
+; }
+; CHECK: remark: source.cpp:13:5: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized
+
+; void test_array_bounds(int *A, int *B, int Length) {
+; #pragma clang loop vectorize(enable)
+; for (int i = 0; i < Length; i++)
+; A[i] = A[B[i]];
+; }
+; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds
+; CHECK: remark: source.cpp:19:5: loop not vectorized
+; CHECK: warning: source.cpp:19:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; int foo();
+; int test_multiple_failures(int *A) {
+; int k = 0;
+; #pragma clang loop vectorize(enable) interleave(enable)
+; for (int i = 0; i < 1000; i+=A[i]) {
+; if (A[i])
+; k = foo();
+; }
+; return k;
+; }
+; CHECK: remark: source.cpp:29:7: loop not vectorized: control flow cannot be substituted for a select
+; CHECK: remark: source.cpp:27:3: loop not vectorized
+
+; YAML: --- !Analysis
+; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: CantComputeNumberOfIterations
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 4, Column: 5 }
+; YAML-NEXT: Function: _Z4testPii
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'loop not vectorized: '
+; YAML-NEXT: - String: could not determine number of loop iterations
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: MissedDetails
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 4, Column: 5 }
+; YAML-NEXT: Function: _Z4testPii
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: loop not vectorized
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: AllDisabled
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 13, Column: 5 }
+; YAML-NEXT: Function: _Z13test_disabledPii
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass: ''
+; YAML-NEXT: Name: CantIdentifyArrayBounds
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function: _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'loop not vectorized: '
+; YAML-NEXT: - String: cannot identify array bounds
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: MissedDetails
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function: _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: loop not vectorized
+; YAML-NEXT: - String: ' (Force='
+; YAML-NEXT: - Force: 'true'
+; YAML-NEXT: - String: ')'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Failure
+; YAML-NEXT: Pass: transform-warning
+; YAML-NEXT: Name: FailedRequestedVectorization
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function: _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: NoCFGForSelect
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 29, Column: 7 }
+; YAML-NEXT: Function: test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'loop not vectorized: '
+; YAML-NEXT: - String: control flow cannot be substituted for a select
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: NonReductionValueUsedOutsideLoop
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function: test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'loop not vectorized: '
+; YAML-NEXT: - String: value that could not be identified as reduction is used outside the loop
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: CantComputeNumberOfIterations
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function: test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'loop not vectorized: '
+; YAML-NEXT: - String: could not determine number of loop iterations
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass: loop-vectorize
+; YAML-NEXT: Name: MissedDetails
+; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function: test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: loop not vectorized
+; YAML-NEXT: ...
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z4testPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 {
+entry:
+ %cmp10 = icmp sgt i32 %Length, 0, !dbg !12
+ br i1 %cmp10, label %for.body, label %for.end, !dbg !12, !llvm.loop !14
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !16
+ %0 = trunc i64 %indvars.iv to i32, !dbg !16
+ %ld = load i32, i32* %arrayidx, align 4
+ store i32 %0, i32* %arrayidx, align 4, !dbg !16, !tbaa !18
+ %cmp3 = icmp sle i32 %ld, %Length, !dbg !22
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !12
+ %1 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %1, %Length, !dbg !12
+ %or.cond = and i1 %cmp3, %cmp, !dbg !22
+ br i1 %or.cond, label %for.body, label %for.end, !dbg !22
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !24
+}
+
+; CHECK: _Z4testPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z13test_disabledPii(i32* nocapture %A, i32 %Length) #0 !dbg !7 {
+entry:
+ %cmp4 = icmp sgt i32 %Length, 0, !dbg !25
+ br i1 %cmp4, label %for.body, label %for.end, !dbg !25, !llvm.loop !27
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !30
+ %0 = trunc i64 %indvars.iv to i32, !dbg !30
+ store i32 %0, i32* %arrayidx, align 4, !dbg !30, !tbaa !18
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !25
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !25
+ %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !25
+ br i1 %exitcond, label %for.end, label %for.body, !dbg !25, !llvm.loop !27
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !31
+}
+
+; CHECK: _Z13test_disabledPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 !dbg !8 {
+entry:
+ %cmp9 = icmp sgt i32 %Length, 0, !dbg !32
+ br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32, !llvm.loop !34
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !35
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv, !dbg !35
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !35, !tbaa !18
+ %idxprom1 = sext i32 %0 to i64, !dbg !35
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35
+ %1 = load i32, i32* %arrayidx2, align 4, !dbg !35, !tbaa !18
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !35
+ store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32
+ %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !34
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void, !dbg !36
+}
+
+; CHECK: _Z17test_array_boundsPiS_i
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; Function Attrs: nounwind uwtable
+define i32 @test_multiple_failures(i32* nocapture readonly %A) #0 !dbg !46 {
+entry:
+ br label %for.body, !dbg !38
+
+for.body: ; preds = %entry, %for.inc
+ %i.09 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
+ %k.09 = phi i32 [ 0, %entry ], [ %k.1, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.09, !dbg !40
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !40
+ %tobool = icmp eq i32 %0, 0, !dbg !40
+ br i1 %tobool, label %for.inc, label %if.then, !dbg !40
+
+if.then: ; preds = %for.body
+ %call = tail call i32 (...) @foo(), !dbg !41
+ %.pre = load i32, i32* %arrayidx, align 4
+ br label %for.inc, !dbg !42
+
+for.inc: ; preds = %for.body, %if.then
+ %1 = phi i32 [ %.pre, %if.then ], [ 0, %for.body ], !dbg !43
+ %k.1 = phi i32 [ %call, %if.then ], [ %k.09, %for.body ]
+ %add = add nsw i32 %1, %i.09, !dbg !44
+ %cmp = icmp slt i32 %add, 1000, !dbg !45
+ br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !38
+
+for.cond.cleanup: ; preds = %for.inc
+ ret i32 %k.1, !dbg !39
+}
+
+declare i32 @foo(...)
+
+; CHECK: test_multiple_failures
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 10, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!8 = distinct !DISubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 16, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!9 = !{i32 2, !"Dwarf Version", i32 2}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.5.0"}
+!12 = !DILocation(line: 3, column: 8, scope: !13)
+!13 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!14 = !{!14, !15, !15}
+!15 = !{!"llvm.loop.vectorize.enable", i1 true}
+!16 = !DILocation(line: 4, column: 5, scope: !17)
+!17 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !13)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !DILocation(line: 5, column: 9, scope: !23)
+!23 = distinct !DILexicalBlock(line: 5, column: 9, file: !1, scope: !17)
+!24 = !DILocation(line: 8, column: 1, scope: !4)
+!25 = !DILocation(line: 12, column: 8, scope: !26)
+!26 = distinct !DILexicalBlock(line: 12, column: 3, file: !1, scope: !7)
+!27 = !{!27, !28, !29}
+!28 = !{!"llvm.loop.interleave.count", i32 1}
+!29 = !{!"llvm.loop.vectorize.width", i32 1}
+!30 = !DILocation(line: 13, column: 5, scope: !26)
+!31 = !DILocation(line: 14, column: 1, scope: !7)
+!32 = !DILocation(line: 18, column: 8, scope: !33)
+!33 = distinct !DILexicalBlock(line: 18, column: 3, file: !1, scope: !8)
+!34 = !{!34, !15}
+!35 = !DILocation(line: 19, column: 5, scope: !33)
+!36 = !DILocation(line: 20, column: 1, scope: !8)
+!37 = distinct !DILexicalBlock(line: 24, column: 3, file: !1, scope: !46)
+!38 = !DILocation(line: 27, column: 3, scope: !37)
+!39 = !DILocation(line: 31, column: 3, scope: !37)
+!40 = !DILocation(line: 28, column: 9, scope: !37)
+!41 = !DILocation(line: 29, column: 11, scope: !37)
+!42 = !DILocation(line: 29, column: 7, scope: !37)
+!43 = !DILocation(line: 27, column: 32, scope: !37)
+!44 = !DILocation(line: 27, column: 30, scope: !37)
+!45 = !DILocation(line: 27, column: 21, scope: !37)
+!46 = distinct !DISubprogram(name: "test_multiple_failures", line: 26, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 26, file: !1, scope: !5, type: !6, retainedNodes: !2)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
new file mode 100644
index 00000000000..ac0648a6838
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
@@ -0,0 +1,112 @@
+; RUN: opt < %s -loop-vectorize -pass-remarks-missed='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+
+; Verify analysis remarks are generated when interleaving is not beneficial.
+; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that vectorization is not beneficial
+; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that interleaving is not beneficial and is explicitly disabled or interleave count is set to 1
+; CHECK: remark: vectorization-remarks-profitable.c:12:17: the cost-model indicates that vectorization is not beneficial
+; CHECK: remark: vectorization-remarks-profitable.c:12:17: the cost-model indicates that interleaving is not beneficial
+
+; First loop.
+; #pragma clang loop interleave(disable) unroll(disable)
+; for(int i = 0; i < n; i++) {
+; out[i] = *in[i];
+; }
+
+; Second loop.
+; #pragma clang loop unroll(disable)
+; for(int i = 0; i < n; i++) {
+; out[i] = *in[i];
+; }
+
+; ModuleID = 'vectorization-remarks-profitable.ll'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Function Attrs: nounwind uwtable
+define void @do_not_interleave(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !4 {
+entry:
+ %cmp.4 = icmp eq i32 %size, 0, !dbg !10
+ br i1 %cmp.4, label %for.end, label %for.body.preheader, !dbg !11
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !12
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float*, float** %in, i64 %indvars.iv, !dbg !12
+ %0 = bitcast float** %arrayidx to i32**, !dbg !12
+ %1 = load i32*, i32** %0, align 8, !dbg !12
+ %2 = load i32, i32* %1, align 4, !dbg !13
+ %arrayidx2 = getelementptr inbounds float, float* %out, i64 %indvars.iv, !dbg !14
+ %3 = bitcast float* %arrayidx2 to i32*, !dbg !15
+ store i32 %2, i32* %3, align 4, !dbg !15
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !11
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !11
+ %exitcond = icmp eq i32 %lftr.wideiv, %size, !dbg !11
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !11, !llvm.loop !16
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end, !dbg !19
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void, !dbg !19
+}
+
+; Function Attrs: nounwind uwtable
+define void @interleave_not_profitable(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !6 {
+entry:
+ %cmp.4 = icmp eq i32 %size, 0, !dbg !20
+ br i1 %cmp.4, label %for.end, label %for.body, !dbg !21
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float*, float** %in, i64 %indvars.iv, !dbg !22
+ %0 = bitcast float** %arrayidx to i32**, !dbg !22
+ %1 = load i32*, i32** %0, align 8, !dbg !22
+ %2 = load i32, i32* %1, align 4, !dbg !23
+ %arrayidx2 = getelementptr inbounds float, float* %out, i64 %indvars.iv, !dbg !24
+ %3 = bitcast float* %arrayidx2 to i32*, !dbg !25
+ store i32 %2, i32* %3, align 4, !dbg !25
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21
+ %exitcond = icmp eq i32 %lftr.wideiv, %size, !dbg !21
+ br i1 %exitcond, label %for.end, label %for.body, !dbg !21, !llvm.loop !26
+
+for.end: ; preds = %for.body, %entry
+ ret void, !dbg !27
+}
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250016)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "vectorization-remarks-profitable.c", directory: "")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "do_not_interleave", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!5 = !DISubroutineType(types: !2)
+!6 = distinct !DISubprogram(name: "interleave_not_profitable", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.8.0 (trunk 250016)"}
+!10 = !DILocation(line: 4, column: 23, scope: !4)
+!11 = !DILocation(line: 4, column: 3, scope: !4)
+!12 = !DILocation(line: 5, column: 17, scope: !4)
+!13 = !DILocation(line: 5, column: 16, scope: !4)
+!14 = !DILocation(line: 5, column: 7, scope: !4)
+!15 = !DILocation(line: 5, column: 14, scope: !4)
+!16 = distinct !{!16, !17, !18}
+!17 = !{!"llvm.loop.interleave.count", i32 1}
+!18 = !{!"llvm.loop.unroll.disable"}
+!19 = !DILocation(line: 6, column: 1, scope: !4)
+!20 = !DILocation(line: 11, column: 23, scope: !6)
+!21 = !DILocation(line: 11, column: 3, scope: !6)
+!22 = !DILocation(line: 12, column: 17, scope: !6)
+!23 = !DILocation(line: 12, column: 16, scope: !6)
+!24 = !DILocation(line: 12, column: 7, scope: !6)
+!25 = !DILocation(line: 12, column: 14, scope: !6)
+!26 = distinct !{!26, !18}
+!27 = !DILocation(line: 13, column: 1, scope: !6)
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
new file mode 100644
index 00000000000..0345a3ba045
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
+
+; RUN: llc < %s -mtriple x86_64-pc-linux-gnu -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s
+; DEBUG-OUTPUT-NOT: .loc
+; DEBUG-OUTPUT-NOT: {{.*}}.debug_info
+
+; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1)
+; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4)
+; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @foo(i32 %n) #0 !dbg !4 {
+entry:
+ %diff = alloca i32, align 4
+ %cb = alloca [16 x i8], align 16
+ %cc = alloca [16 x i8], align 16
+ store i32 0, i32* %diff, align 4, !dbg !10, !tbaa !11
+ br label %for.body, !dbg !15
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ], !dbg !19
+ %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* %cb, i64 0, i64 %indvars.iv, !dbg !19
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !19, !tbaa !21
+ %conv = sext i8 %0 to i32, !dbg !19
+ %arrayidx2 = getelementptr inbounds [16 x i8], [16 x i8]* %cc, i64 0, i64 %indvars.iv, !dbg !19
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !19, !tbaa !21
+ %conv3 = sext i8 %1 to i32, !dbg !19
+ %sub = sub i32 %conv, %conv3, !dbg !19
+ %add = add nsw i32 %sub, %add8, !dbg !19
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !15
+ %exitcond = icmp eq i64 %indvars.iv.next, 16, !dbg !15
+ br i1 %exitcond, label %for.end, label %for.body, !dbg !15
+
+for.end: ; preds = %for.body
+ store i32 %add, i32* %diff, align 4, !dbg !19, !tbaa !11
+ call void @ibar(i32* %diff) #2, !dbg !22
+ ret i32 0, !dbg !23
+}
+
+declare void @ibar(i32*) #1
+
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+!llvm.dbg.cu = !{!24}
+
+!1 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !24, scopeLine: 6, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "vectorization-remarks.c", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0 "}
+!10 = !DILocation(line: 8, column: 3, scope: !4)
+!11 = !{!12, !12, i64 0}
+!12 = !{!"int", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 17, column: 8, scope: !16)
+!16 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !17)
+!17 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !18)
+!18 = distinct !DILexicalBlock(line: 17, column: 3, file: !1, scope: !4)
+!19 = !DILocation(line: 18, column: 5, scope: !20)
+!20 = distinct !DILexicalBlock(line: 17, column: 27, file: !1, scope: !18)
+!21 = !{!13, !13, i64 0}
+!22 = !DILocation(line: 20, column: 3, scope: !4)
+!23 = !DILocation(line: 21, column: 3, scope: !4)
+!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, emissionKind: NoDebug)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll
new file mode 100644
index 00000000000..d1473552c98
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
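+; The target-cpu/target-features attributes at the bottom disable SSE2 (and
+; AVX/AVX2), leaving no legal integer vector types on x86, so this sum
+; reduction (roughly, in C: for (i = 0; i < N; ++i) sum += x[i];) must stay
+; scalar.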
+define i32 @accum(i32* nocapture readonly %x, i32 %N) #0 {
+entry:
+; CHECK-LABEL: @accum
+; CHECK-NOT: x i32>
+
+ %cmp1 = icmp sgt i32 %N, 0
+ br i1 %cmp1, label %for.inc.preheader, label %for.end
+
+for.inc.preheader:
+ br label %for.inc
+
+for.inc:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.inc.preheader ]
+ %sum.02 = phi i32 [ %add, %for.inc ], [ 0, %for.inc.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %sum.02
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.inc
+
+for.end.loopexit:
+ %add.lcssa = phi i32 [ %add, %for.inc ]
+ br label %for.end
+
+for.end:
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
+ ret i32 %sum.0.lcssa
+
+; CHECK: ret i32
+}
+
+attributes #0 = { "target-cpu"="core2" "target-features"="+sse,-avx,-avx2,-sse2" }
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..0b23d6286f9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -0,0 +1,826 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; When masked interleaved groups are disabled:
+; Check that the predicated load is not vectorized as an
+; interleaved group but rather as scalarized accesses.
+; (For SKX, the compiler does not support gathers of i8 elements, so the
+; only remaining alternative is to scalarize.)
+; In this case a scalar epilogue is not needed.
+;
+; When masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load for an interleave-group (with
+; a single member).
+; Since the last (second) member of the load-group is a gap, peeling is used,
+; so we also expect to find a scalar epilogue loop.
+;
+; void masked_strided1(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char t = p[2*ix];
+; q[ix] = t;
+; }
+; }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED: for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED: for.body:
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.09, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.09, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+ store i8 %0, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.09, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; Exactly the same scenario, except we are now optimizing for size; therefore
+; we check that no scalar epilogue is created. Since we cannot create an
+; epilogue, we need the ability to mask out the gaps.
+; When enable-masked-interleaved-mem-accesses is enabled, the interleave
+; groups will be vectorized with masked wide loads, with the mask properly
+; shuffled and then and-ed with the gaps mask.
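+; Concretely (as the CHECK lines below show), the 8-lane condition mask is
+; replicated per group member into a 16-lane interleaved mask and then and-ed
+; with the constant gaps mask <1,0,1,0,...> so that the absent second member
+; of each group is never accessed.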
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize(
+;ENABLED_MASKED_STRIDED-NEXT: entry:
+;ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]])
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]]
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED: for.end:
+;ENABLED_MASKED_STRIDED-NEXT: ret void
+
+
+define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.09, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.09, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+ store i8 %0, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.09, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+; Accesses with gaps under the optsize scenario again, this time with an
+; unknown trip count, in order to check the behavior of tail folding (folding
+; the remainder loop into the main loop using masking) together with
+; interleaved groups.
+; When masked-interleave-group is disabled, the interleave groups will be
+; invalidated during the legality checks, so there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled, we check that there is no epilogue
+; and that the interleave groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an AND of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also and-ed with the gaps mask.
+;
+; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard,
+; int n) {
+; for(ix=0; ix < n; ++ix) {
+; if (ix > guard) {
+; char t = p[2*ix];
+; q[ix] = t;
+; }
+; }
+; }
+
+; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED: vector.body:
+; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED: pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT: for.body:
+; DISABLED_MASKED_STRIDED: for.end:
+; DISABLED_MASKED_STRIDED-NEXT: ret void
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT: entry:
+; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED-NOT: for.body:
+; ENABLED_MASKED_STRIDED: for.end:
+; ENABLED_MASKED_STRIDED-NEXT: ret void
+
+define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.010, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.010, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+ store i8 %0, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.010, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Same, but with stride 3, to check the gaps-mask and the shuffled mask
+; with a different stride.
+; The accesses again have gaps under an Optsize scenario, with an unknown
+; trip-count, in order to check the behavior of folding-the-tail (folding the
+; remainder loop into the main loop using masking) together with interleaved-
+; groups.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard,
+; int n) {
+; for(ix=0; ix < n; ++ix) {
+; if (ix > guard) {
+; char t = p[3*ix];
+; q[ix] = t;
+; }
+; }
+; }
+
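+; A small sketch of the per-iteration mask logic that the checks below
+; expect (a hypothetical C helper, for illustration only; it is not part of
+; the test):
+;
+; #include <stdbool.h>
+; // One vector iteration starting at 'index', for VF = 8 and stride 3.
+; void stride3_mask(int index, int n, unsigned guard, bool out[24]) {
+;   for (int lane = 0; lane < 8; ++lane) {
+;     unsigned ix = (unsigned)(index + lane);
+;     // the 'if' mask And-ed with the remainder (tail-folding) mask:
+;     bool live = (ix > guard) && (ix <= (unsigned)(n - 1));
+;     for (int member = 0; member < 3; ++member)
+;       // replicate each lane bit 3x (the shuffled mask), then And with
+;       // the gaps mask <1,0,0,1,0,0,...> that keeps only member 0:
+;       out[3 * lane + member] = live && (member == 0);
+;   }
+; }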
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT: entry:
+; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]])
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED: for.end:
+; ENABLED_MASKED_STRIDED-NEXT: ret void
+;
+define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.010, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = mul nsw i32 %ix.010, 3
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010
+ store i8 %0, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.010, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; Back to stride 2 with gaps, with a known trip count, under opt-for-size,
+; but this time the load/store are not predicated.
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks because we have gaps and we can't
+; create an epilog. The access is thus scalarized.
+; (Before the fix that this test checks, we used to create an epilogue despite
+; optsize, and vectorized the access as an interleaved-group. This is now fixed,
+; and we make sure that a scalar epilogue does not exist).
+; When enable-masked-interleaved-access is enabled, the interleave-groups will
+; be vectorized with masked wide-loads (masking away the gaps).
+;
+; void unconditional_strided1_optsize(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+; char t = p[2*ix];
+; q[ix] = t;
+; }
+; }
+
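+; Here there is no per-lane condition and the trip count (1024) is a
+; multiple of the VF, so the only mask the enabled run needs is the constant
+; gaps mask selecting member 0 of each stride-2 group. A minimal sketch of
+; that constant (illustration only, assuming VF = 8):
+;
+; static const _Bool kGapsMaskVF8Stride2[16] =
+;   {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
+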
+;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED: for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;ENABLED_MASKED_STRIDED-NEXT: entry:
+;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED: for.end:
+;ENABLED_MASKED_STRIDED-NEXT: ret void
+
+
+define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+ br label %for.body
+
+for.body:
+ %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %mul = shl nuw nsw i32 %ix.06, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
+ store i8 %0, i8* %arrayidx1, align 1
+ %inc = add nuw nsw i32 %ix.06, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+
+; Unconditional accesses with gaps under an Optsize scenario again, this time
+; with an unknown trip-count, in order to check the behavior of folding-the-tail
+; (folding the remainder loop into the main loop using masking) together with
+; interleaved-groups. Folding-the-tail turns the accesses into conditional
+; accesses, which requires proper masking. In addition, we need to mask out
+; the gaps (all because we are not allowed to use an epilog due to optsize).
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks. So there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; for(ix=0; ix < n; ++ix) {
+; char t = p[2*ix];
+; q[ix] = t;
+; }
+
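+; A sketch of the lane predicate the enabled run is expected to build
+; (hypothetical C helper, for illustration only): with no 'if' in the source,
+; it is just the remainder (tail-folding) mask, later replicated 2x and
+; And-ed with the stride-2 gaps mask before feeding the wide masked load.
+;
+; #include <stdbool.h>
+; bool lane_live(int index, int lane, int n) { // VF = 8: lane in [0,8)
+;   return (unsigned)(index + lane) <= (unsigned)(n - 1);
+; }
+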
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED: vector.body:
+; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED: pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT: for.body:
+; DISABLED_MASKED_STRIDED: for.end:
+; DISABLED_MASKED_STRIDED-NEXT: ret void
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT: entry:
+; ENABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]])
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]]
+; ENABLED_MASKED_STRIDED-NOT: for.body:
+; ENABLED_MASKED_STRIDED: for.end:
+; ENABLED_MASKED_STRIDED-NEXT: ret void
+
+define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %ix.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %mul = shl nuw nsw i32 %ix.07, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.07
+ store i8 %0, i8* %arrayidx1, align 1
+ %inc = add nuw nsw i32 %ix.07, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; Check also a scenario with full interleave-groups (no gaps) as well as both
+; load and store groups. We check that when masked-interleave-group is disabled
+; the predicated loads (and stores) are not vectorized as an
+; interleaved-group but rather as four separate scalarized accesses.
+; (For SKX, gather/scatter is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; When masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load/store for the two interleave-
+; groups.
+;
+; void masked_strided2(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char left = p[2*ix];
+; char right = p[2*ix + 1];
+; char max = max(left, right);
+; q[2*ix] = max;
+; q[2*ix+1] = 0 - max;
+; }
+; }
+;}
+
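+; A sketch of one vector iteration under the enabled run (illustrative C
+; helper only; the test checks the IR, not this code). Both members of each
+; group are accessed, so no gaps mask is needed; the replicated 'if' mask
+; guards one wide masked load and one wide masked store:
+;
+; void strided2_iter_sketch(const signed char p[16], signed char q[16],
+;                           const _Bool mask[8]) {
+;   for (int lane = 0; lane < 8; ++lane) {
+;     if (!mask[lane])
+;       continue;                          // masked-off lane: no access
+;     signed char left = p[2 * lane];      // even elements (STRIDED_VEC)
+;     signed char right = p[2 * lane + 1]; // odd elements (STRIDED_VEC3)
+;     signed char m = left < right ? right : left;
+;     q[2 * lane] = m;                     // re-interleaved store members
+;     q[2 * lane + 1] = (signed char)(0 - m);
+;   }
+; }
+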
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)
+
+; Function Attrs: norecurse nounwind
+define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.024, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.024, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = or i32 %mul, 1
+ %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+ %1 = load i8, i8* %arrayidx4, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %spec.select.i, i8* %arrayidx6, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx11, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.024, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; Full groups again, this time checking an Optsize scenario, with unknown trip-
+; count, to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; When masked-interleave-group is disabled, the interleave-groups will be
+; invalidated during the legality check, so there is nothing to check here.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+;
+; void masked_strided2_unknown_tc(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; int guard,
+; int n) {
+; for(ix=0; ix < n; ++ix) {
+; if (ix > guard) {
+; char left = p[2*ix];
+; char right = p[2*ix + 1];
+; char max = max(left, right);
+; q[2*ix] = max;
+; q[2*ix+1] = 0 - max;
+; }
+; }
+;}
+
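+; Relative to @masked_strided2 above (see the sketch there), the only change
+; in the expected mask is the per-lane bit: it becomes
+; (ix > guard) && (ix <= n-1), i.e. the 'if' mask And-ed with the remainder
+; mask, before being replicated for the wide masked load and store.
+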
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or i32 [[TMP1]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add <8 x i32> {{.*}}, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP13]],
+; ENABLED_MASKED_STRIDED: for.end:
+; ENABLED_MASKED_STRIDED-NEXT: ret void
+
+define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize {
+entry:
+ %cmp22 = icmp sgt i32 %n, 0
+ br i1 %cmp22, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %ix.023 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+ %cmp1 = icmp sgt i32 %ix.023, %guard
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.023, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = or i32 %mul, 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %p, i32 %add
+ %1 = load i8, i8* %arrayidx3, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx5 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %spec.select.i, i8* %arrayidx5, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx9 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx9, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.023, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Full groups under an Optsize scenario again, with an unknown trip-count, in
+; order to check the behavior of folding-the-tail (folding the remainder loop
+; into the main loop using masking) together with interleaved-groups.
+; This time the accesses are not conditional; they become conditional only
+; due to tail folding.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during cost-model checks, so we check for no epilogue and
+; scalarized conditional accesses.
+; When masked-interleave-group is enabled we check for no epilogue,
+; and interleave-groups vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; (Same vectorization scheme as for the previous loop with conditional accesses
+; except here the mask only masks away the remainder iterations.)
+;
+; void unconditional_masked_strided2_unknown_tc(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; int n) {
+; for(ix=0; ix < n; ++ix) {
+; char left = p[2*ix];
+; char right = p[2*ix + 1];
+; char max = max(left, right);
+; q[2*ix] = max;
+; q[2*ix+1] = 0 - max;
+; }
+;}
+
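+; Here, with no 'if' in the source, the per-lane bit (see the sketches above)
+; reduces to the remainder mask alone, (ix <= n-1), replicated for both the
+; wide masked load and the wide masked store.
+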
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; DISABLED_MASKED_STRIDED: vector.body:
+; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED: pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT: for.body:
+; DISABLED_MASKED_STRIDED: for.end:
+; DISABLED_MASKED_STRIDED-NEXT: ret void
+
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc(
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef)
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or i32 [[TMP0]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]]
+; ENABLED_MASKED_STRIDED: for.end:
+; ENABLED_MASKED_STRIDED-NEXT: ret void
+
+define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+ %cmp20 = icmp sgt i32 %n, 0
+ br i1 %cmp20, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %ix.021 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %mul = shl nuw nsw i32 %ix.021, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = or i32 %mul, 1
+ %arrayidx2 = getelementptr inbounds i8, i8* %p, i32 %add
+ %1 = load i8, i8* %arrayidx2, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %spec.select.i, i8* %arrayidx4, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx8, align 1
+ %inc = add nuw nsw i32 %ix.021, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
new file mode 100644
index 00000000000..7d5526fe391
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
@@ -0,0 +1,60 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; This test checks the fix for PR39099.
+;
+; Check that the predicated load is not vectorized as an
+; interleaved-group (which requires proper masking, currently unsupported)
+; but rather as scalarized accesses.
+; (For SKX, gather is not supported by the compiler for chars; therefore
+; the only remaining alternative is to scalarize.)
+;
+; void masked_strided(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char t = p[2*ix];
+; q[ix] = t;
+; }
+; }
+; }
+
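+; A sketch of the scalarized shape the checks below expect (illustrative C
+; helper with hypothetical names, not part of the test): each predicated
+; lane becomes its own branch plus scalar load, instead of one wide
+; interleaved vector load.
+;
+; // One vector iteration at base 'index', VF = 8, stride 2.
+; void scalarized_iter(const unsigned char *p, unsigned char vec[8],
+;                      const _Bool mask[8], int index) {
+;   for (int lane = 0; lane < 8; ++lane)
+;     if (mask[lane])                        // extractelement + branch
+;       vec[lane] = p[2 * (index + lane)];   // scalar load (pred.load.if)
+; }
+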
+;CHECK-LABEL: @masked_strided(
+;CHECK: vector.body:
+;CHECK-NEXT: %index = phi i32
+;CHECK-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;CHECK-NEXT: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;CHECK-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;CHECK-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;CHECK-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;CHECK-NOT: load <16 x i8>, <16 x i8>* %{{.*}}, align 1
+;CHECK-NOT: shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+
+define dso_local void @masked_strided(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.09, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.09, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09
+ store i8 %0, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.09, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
new file mode 100644
index 00000000000..45d843c9d58
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -0,0 +1,98 @@
+; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s
+; RUN: opt -mcpu=skylake-avx512 -S -force-vector-width=8 -force-vector-interleave=1 -loop-vectorize < %s | FileCheck %s --check-prefix=SINK-GATHER
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK-LABEL: predicated_sdiv_masked_load
+;
+; This test ensures that we don't scalarize the predicated load. Since the load
+; can be vectorized with predication, scalarizing it would cause its pointer
+; operand to become non-uniform.
+;
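+; A sketch of that shape (illustrative C helper with hypothetical names):
+; the load stays a single masked vector load, and only the sdiv, which must
+; not execute on masked-off lanes, is scalarized under its predicate:
+;
+; // One 2-wide iteration; 'wide' stands for %wide.masked.load.
+; void sdiv_iter(const int wide[2], int res[2], const _Bool mask[2], int x) {
+;   for (int lane = 0; lane < 2; ++lane)
+;     if (mask[lane])                 // predicated: sdiv could trap otherwise
+;       res[lane] = wide[lane] / x;
+; }
+;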
+; CHECK: vector.body:
+; CHECK: %wide.masked.load = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T0:.+]] = extractelement <2 x i32> %wide.masked.load, i32 0
+; CHECK: %[[T1:.+]] = sdiv i32 %[[T0]], %x
+; CHECK: %[[T2:.+]] = insertelement <2 x i32> undef, i32 %[[T1]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T3:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T2]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T4:.+]] = extractelement <2 x i32> %wide.masked.load, i32 1
+; CHECK: %[[T5:.+]] = sdiv i32 %[[T4]], %x
+; CHECK: %[[T6:.+]] = insertelement <2 x i32> %[[T3]], i32 %[[T5]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i32> [ %[[T3]], %[[CONT0]] ], [ %[[T6]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp7, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp3 = load i32, i32* %tmp2, align 4
+ %tmp4 = sdiv i32 %tmp3, %x
+ %tmp5 = add nsw i32 %tmp4, %tmp1
+ br label %for.inc
+
+for.inc:
+ %tmp6 = phi i32 [ %tmp1, %for.body ], [ %tmp5, %if.then]
+ %tmp7 = add i32 %r, %tmp6
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 10000
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ %tmp8 = phi i32 [ %tmp7, %for.inc ]
+ ret i32 %tmp8
+}
+
+; This test ensures that a load which would otherwise have been widened is
+; instead scalarized when the cost model so decides, as part of its
+; sink-scalar-operands optimization for predicated instructions.
+;
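+; A sketch of the expected shape (illustrative C helper with hypothetical
+; names): rather than emitting a gather for the strided access, the load is
+; scalarized and sunk into the predicated block, so each lane loads and
+; divides only when the predicate holds:
+;
+; // One 8-wide iteration at base index 'i'.
+; void sunk_iter(const int *a, int tmp[8], _Bool c, int x, long i) {
+;   for (int lane = 0; lane < 8; ++lane)
+;     if (c)                                  // pred.udiv.if
+;       tmp[lane] = a[777 * (i + lane)] / x;  // scalar load + udiv
+; }
+;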
+; SINK-GATHER: vector.body:
+; SINK-GATHER: pred.udiv.if:
+; SINK-GATHER: %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
+; SINK-GATHER: %{{.*}} = udiv i32 %[[T0]], %{{.*}}
+; SINK-GATHER: pred.udiv.continue:
+define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %i7 = mul i64 %i, 777
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i7
+ %tmp2 = load i32, i32* %tmp0, align 4
+ %tmp4 = udiv i32 %tmp2, %x
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i32 [ %x, %for.body ], [ %tmp4, %if.then]
+ %tmp6 = add i32 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
new file mode 100644
index 00000000000..0debb338486
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
@@ -0,0 +1,29 @@
+; RUN: opt -O3 -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@x = common global [1024 x x86_fp80] zeroinitializer, align 16
+
+;CHECK-LABEL: @example(
+;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>*
+;CHECK: store
+;CHECK: ret void
+
+define void @example() nounwind ssp uwtable {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %conv = sitofp i32 1 to x86_fp80
+ %arrayidx = getelementptr inbounds [1024 x x86_fp80], [1024 x x86_fp80]* @x, i64 0, i64 %indvars.iv
+ store x86_fp80 %conv, x86_fp80* %arrayidx, align 16
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/XCore/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/XCore/lit.local.cfg
new file mode 100644
index 00000000000..bb48713fe33
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/XCore/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'XCore' in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll b/llvm/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll
new file mode 100644
index 00000000000..afb3322dd67
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/XCore/no-vector-registers.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S -mtriple=xcore | FileCheck %s
+
+target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
+target triple = "xcore"
+; The xcore target has no vector registers, so the loop should not be vectorized.
+;CHECK-LABEL: @f(
+;CHECK: entry:
+;CHECK-NOT: vector.body
+;CHECK-NEXT: br label %do.body
+define void @f(i8* nocapture %ptr, i32 %len) {
+entry:
+ br label %do.body
+do.body:
+ %ptr.addr.0 = phi i8* [ %ptr, %entry ], [ %incdec.ptr, %do.body ]
+ %len.addr.0 = phi i32 [ %len, %entry ], [ %dec, %do.body ]
+ %incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.0, i32 1
+ store i8 0, i8* %ptr.addr.0, align 1
+ %dec = add nsw i32 %len.addr.0, -1
+ %tobool = icmp eq i32 %len.addr.0, 0
+ br i1 %tobool, label %do.end, label %do.body
+do.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/align.ll b/llvm/test/Transforms/LoopVectorize/align.ll
new file mode 100644
index 00000000000..3707f7dcb08
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/align.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we output the ABI alignment if no alignment is specified.
+
+;CHECK-LABEL: @align
+;CHECK: load <4 x i32>, <4 x i32>* {{.*}} align 4
+;CHECK: load <4 x i32>, <4 x i32>* {{.*}} align 4
+;CHECK: store <4 x i32> {{.*}} align 4
+
+define void @align(i32* %a, i32* %b, i32* %c) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %3 = load i32, i32* %2
+ %4 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+ %5 = load i32, i32* %4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %6, i32* %7
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/bsd_regex.ll b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
new file mode 100644
index 00000000000..ddb00ba31e1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-interleave=2 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;PR15830.
+
+;CHECK-LABEL: @foo(
+; When scalarizing stores, we need to preserve their original order.
+; Make sure that we are extracting in the correct order (0101, and not 0011).
+;CHECK: extractelement <2 x i64> {{.*}}, i32 0
+;CHECK: extractelement <2 x i64> {{.*}}, i32 1
+;CHECK: extractelement <2 x i64> {{.*}}, i32 0
+;CHECK: extractelement <2 x i64> {{.*}}, i32 1
+;CHECK: store
+;CHECK: store
+;CHECK: store
+;CHECK: store
+;CHECK: ret
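+; That is, with VF=2 and IC=2 the four extracts must follow lane order
+; part0[0], part0[1], part1[0], part1[1] (0101), and not lane 0 of both
+; parts followed by lane 1 of both parts (0011).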
+
+define i32 @foo(i32* nocapture %A) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+ store i32 4, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 undef
+}
+
+
diff --git a/llvm/test/Transforms/LoopVectorize/bzip_reverse_loops.ll b/llvm/test/Transforms/LoopVectorize/bzip_reverse_loops.ll
new file mode 100644
index 00000000000..70c41f7ce12
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/bzip_reverse_loops.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK: fc
+;CHECK: load <4 x i16>
+;CHECK-NEXT: shufflevector <4 x i16>
+;CHECK: select <4 x i1>
+;CHECK: store <4 x i16>
+;CHECK: ret
+define void @fc(i16* nocapture %p, i32 %n, i32 %size) nounwind uwtable ssp {
+entry:
+ br label %do.body
+
+do.body: ; preds = %cond.end, %entry
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %cond.end ]
+ %p.addr.0 = phi i16* [ %p, %entry ], [ %incdec.ptr, %cond.end ]
+ %incdec.ptr = getelementptr inbounds i16, i16* %p.addr.0, i64 -1
+ %0 = load i16, i16* %incdec.ptr, align 2
+ %conv = zext i16 %0 to i32
+ %cmp = icmp ult i32 %conv, %size
+ br i1 %cmp, label %cond.end, label %cond.true
+
+cond.true: ; preds = %do.body
+ %sub = sub i32 %conv, %size
+ %phitmp = trunc i32 %sub to i16
+ br label %cond.end
+
+cond.end: ; preds = %do.body, %cond.true
+ %cond = phi i16 [ %phitmp, %cond.true ], [ 0, %do.body ]
+ store i16 %cond, i16* %incdec.ptr, align 2
+ %dec = add i32 %n.addr.0, -1
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %do.end, label %do.body
+
+do.end: ; preds = %cond.end
+ ret void
+}
+
+;CHECK: example1
+;CHECK: load <4 x i32>
+;CHECK-NEXT: shufflevector <4 x i32>
+;CHECK: select <4 x i1>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define void @example1(i32* nocapture %a, i32 %n, i32 %wsize) nounwind uwtable ssp {
+entry:
+ br label %do.body
+
+do.body: ; preds = %do.body, %entry
+ %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.body ]
+ %p.0 = phi i32* [ %a, %entry ], [ %incdec.ptr, %do.body ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %p.0, i64 -1
+ %0 = load i32, i32* %incdec.ptr, align 4
+ %cmp = icmp slt i32 %0, %wsize
+ %sub = sub nsw i32 %0, %wsize
+ %cond = select i1 %cmp, i32 0, i32 %sub
+ store i32 %cond, i32* %incdec.ptr, align 4
+ %dec = add nsw i32 %n.addr.0, -1
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %do.end, label %do.body
+
+do.end: ; preds = %do.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/calloc.ll b/llvm/test/Transforms/LoopVectorize/calloc.ll
new file mode 100644
index 00000000000..14fae33589b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/calloc.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK: hexit
+;CHECK: zext <4 x i8>
+;CHECK: ret
+
+define noalias i8* @hexit(i8* nocapture %bytes, i64 %length) nounwind uwtable ssp {
+entry:
+ %shl = shl i64 %length, 1
+ %add28 = or i64 %shl, 1
+ %call = tail call i8* @calloc(i64 1, i64 %add28) nounwind
+ %cmp29 = icmp eq i64 %shl, 0
+ br i1 %cmp29, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %0 = shl i64 %length, 1
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %i.030 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %shr = lshr i64 %i.030, 1
+ %arrayidx = getelementptr inbounds i8, i8* %bytes, i64 %shr
+ %1 = load i8, i8* %arrayidx, align 1
+ %conv = zext i8 %1 to i32
+ %and = shl i64 %i.030, 2
+ %neg = and i64 %and, 4
+ %and3 = xor i64 %neg, 4
+ %sh_prom = trunc i64 %and3 to i32
+ %shl4 = shl i32 15, %sh_prom
+ %and5 = and i32 %conv, %shl4
+ %shr11 = lshr i32 %and5, %sh_prom
+ %conv13 = and i32 %shr11, 254
+ %cmp15 = icmp ugt i32 %conv13, 9
+ %cond = select i1 %cmp15, i32 87, i32 48
+ %add17 = add nsw i32 %cond, %shr11
+ %conv18 = trunc i32 %add17 to i8
+ %arrayidx19 = getelementptr inbounds i8, i8* %call, i64 %i.030
+ store i8 %conv18, i8* %arrayidx19, align 1
+ %inc = add i64 %i.030, 1
+ %exitcond = icmp eq i64 %inc, %0
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret i8* %call
+}
+
+declare noalias i8* @calloc(i64, i64) nounwind
diff --git a/llvm/test/Transforms/LoopVectorize/cast-induction.ll b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
new file mode 100644
index 00000000000..9c2b131c942
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/cast-induction.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; rdar://problem/12848162
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @example12(
+;CHECK: %vec.ind1 = phi <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example12() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = trunc i64 %indvars.iv to i32
+ store i32 %3, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %4, label %1
+
+; <label>:4 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll b/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
new file mode 100644
index 00000000000..be6de28b83f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/conditional-assignment.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -enable-cond-stores-vec=false -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; RUN: opt < %s -enable-cond-stores-vec=false -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+
+; CHECK: remark: source.c:2:8: the cost-model indicates that vectorization is not beneficial
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind ssp uwtable
+define void @conditional_store(i32* noalias nocapture %indices) #0 !dbg !4 {
+entry:
+ br label %for.body, !dbg !10
+
+for.body: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, i32* %indices, i64 %indvars.iv, !dbg !12
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !12, !tbaa !14
+ %cmp1 = icmp eq i32 %0, 1024, !dbg !12
+ br i1 %cmp1, label %if.then, label %for.inc, !dbg !12
+
+if.then: ; preds = %for.body
+ store i32 0, i32* %arrayidx, align 4, !dbg !18, !tbaa !14
+ br label %for.inc, !dbg !18
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+ %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !10
+ br i1 %exitcond, label %for.end, label %for.body, !dbg !10
+
+for.end: ; preds = %for.inc
+ ret void, !dbg !19
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0", isOptimized: true, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.c", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "conditional_store", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.c", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.6.0"}
+!10 = !DILocation(line: 2, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 2, column: 3, file: !1, scope: !4)
+!12 = !DILocation(line: 3, column: 9, scope: !13)
+!13 = distinct !DILexicalBlock(line: 3, column: 9, file: !1, scope: !11)
+!14 = !{!15, !15, i64 0}
+!15 = !{!"int", !16, i64 0}
+!16 = !{!"omnipotent char", !17, i64 0}
+!17 = !{!"Simple C/C++ TBAA"}
+!18 = !DILocation(line: 3, column: 29, scope: !13)
+!19 = !DILocation(line: 4, column: 1, scope: !4)
diff --git a/llvm/test/Transforms/LoopVectorize/consec_no_gep.ll b/llvm/test/Transforms/LoopVectorize/consec_no_gep.ll
new file mode 100644
index 00000000000..253fd184d08
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/consec_no_gep.ll
@@ -0,0 +1,42 @@
+;RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+;; Check consecutive memory access without a preceding GEP instruction
+
+; for (int i=0; i<len; i++) {
+; *to++ = *from++;
+; }
+
+; CHECK-LABEL: @consecutive_no_gep(
+; CHECK: vector.body
+; CHECK: %[[index:.*]] = phi i64 [ 0, %vector.ph ]
+; CHECK: getelementptr float, float* %{{.*}}, i64 %[[index]]
+; CHECK: load <4 x float>
+
+define void @consecutive_no_gep(float* noalias nocapture readonly %from, float* noalias nocapture %to, i32 %len) #0 {
+entry:
+ %cmp2 = icmp sgt i32 %len, 0
+ br i1 %cmp2, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %from.addr.04 = phi float* [ %incdec.ptr, %for.body ], [ %from, %for.body.preheader ]
+ %to.addr.03 = phi float* [ %incdec.ptr1, %for.body ], [ %to, %for.body.preheader ]
+ %incdec.ptr = getelementptr inbounds float, float* %from.addr.04, i64 1
+ %val = load float, float* %from.addr.04, align 4
+ %incdec.ptr1 = getelementptr inbounds float, float* %to.addr.03, i64 1
+ store float %val, float* %to.addr.03, align 4
+ %inc = add nsw i32 %i.05, 1
+ %cmp = icmp slt i32 %inc, %len
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
new file mode 100644
index 00000000000..c942d26af86
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
@@ -0,0 +1,490 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s --check-prefix=INTER
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+%pair = type { i32, i32 }
+
+; CHECK-LABEL: consecutive_ptr_forward
+;
+; Check that a forward consecutive pointer is recognized as uniform and remains
+; uniform after vectorization.
+;
+; CHECK: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NOT: getelementptr
+; CHECK: getelementptr inbounds i32, i32* %a, i64 %index
+; CHECK-NOT: getelementptr
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
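+; A rough C equivalent of the loop below (a hypothetical reconstruction from
+; the IR, shown for readability):
+;
+;   int consecutive_ptr_forward(int *a, long n) {
+;     int s = 0;
+;     for (long i = 0; i < n; i++)
+;       s += a[i];
+;     return s;
+;   }
+;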
+define i32 @consecutive_ptr_forward(i32* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %tmp3 = add i32 %tmp0, %tmp2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp4 = phi i32 [ %tmp3, %for.body ]
+ ret i32 %tmp4
+}
+
+; CHECK-LABEL: consecutive_ptr_reverse
+;
+; Check that a reverse consecutive pointer is recognized as uniform and remains
+; uniform after vectorization.
+;
+; CHECK: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i64 %n, %index
+; CHECK-NOT: getelementptr
+; CHECK: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 -3
+; CHECK: getelementptr inbounds i32, i32* %[[G0]], i64 %offset.idx
+; CHECK-NOT: getelementptr
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
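+; In C terms, the loop below is roughly (hypothetical reconstruction):
+;
+;   int consecutive_ptr_reverse(int *a, long n) {
+;     int s = 0;
+;     for (long i = n; i > 0; i--)
+;       s += a[i];
+;     return s;
+;   }
+;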
+define i32 @consecutive_ptr_reverse(i32* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ %n, %entry ]
+ %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %tmp3 = add i32 %tmp0, %tmp2
+ %i.next = add nuw nsw i64 %i, -1
+ %cond = icmp sgt i64 %i.next, 0
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp4 = phi i32 [ %tmp3, %for.body ]
+ ret i32 %tmp4
+}
+
+; CHECK-LABEL: interleaved_access_forward
+; INTER-LABEL: interleaved_access_forward
+;
+; Check that a consecutive-like pointer used by a forward interleaved group is
+; recognized as uniform and remains uniform after vectorization. When
+; interleaved memory accesses aren't enabled, the pointer should not be
+; recognized as uniform, and it should not be uniform after vectorization.
+;
+; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[I1:.+]] = or i64 %index, 1
+; CHECK: %[[I2:.+]] = or i64 %index, 2
+; CHECK: %[[I3:.+]] = or i64 %index, 3
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %index, i32 1
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 1
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 1
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 1
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+; INTER: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; INTER: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; INTER: vector.body
+; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTER-NOT: getelementptr
+; INTER: getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
+; INTER-NOT: getelementptr
+; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
+;
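+; A C sketch of the loop below (hypothetical source; %pair corresponds to the
+; struct):
+;
+;   struct pair { int x, y; };
+;   int interleaved_access_forward(struct pair *p, long n) {
+;     int s = 0;
+;     for (long i = 0; i < n; i++)
+;       s += p[i].x + p[i].y;
+;     return s;
+;   }
+;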
+define i32 @interleaved_access_forward(%pair* %p, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = phi i32 [ %tmp6, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+ %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ %tmp3 = load i32, i32* %tmp1, align 8
+ %tmp4 = load i32, i32* %tmp2, align 8
+ %tmp5 = add i32 %tmp3, %tmp4
+ %tmp6 = add i32 %tmp0, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp14 = phi i32 [ %tmp6, %for.body ]
+ ret i32 %tmp14
+}
+
+; CHECK-LABEL: interleaved_access_reverse
+; INTER-LABEL: interleaved_access_reverse
+;
+; Check that a consecutive-like pointer used by a reverse interleaved group is
+; recognized as uniform and remains uniform after vectorization. When
+; interleaved memory accesses aren't enabled, the pointer should not be
+; recognized as uniform, and it should not be uniform after vectorization.
+;
+; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i64 %n, %index
+; CHECK: %[[I1:.+]] = add i64 %offset.idx, -1
+; CHECK: %[[I2:.+]] = add i64 %offset.idx, -2
+; CHECK: %[[I3:.+]] = add i64 %offset.idx, -3
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 0
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 1
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 1
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 1
+; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 1
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+; INTER: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; INTER: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+; INTER: vector.body
+; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTER: %offset.idx = sub i64 %n, %index
+; INTER-NOT: getelementptr
+; INTER: %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 0
+; INTER: getelementptr inbounds i32, i32* %[[G0]], i64 -6
+; INTER-NOT: getelementptr
+; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
+;
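+; A C sketch of the loop below (hypothetical source):
+;
+;   struct pair { int x, y; };
+;   int interleaved_access_reverse(struct pair *p, long n) {
+;     int s = 0;
+;     for (long i = n; i > 0; i--)
+;       s += p[i].x + p[i].y;
+;     return s;
+;   }
+;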
+define i32 @interleaved_access_reverse(%pair* %p, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ %n, %entry ]
+ %tmp0 = phi i32 [ %tmp6, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+ %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ %tmp3 = load i32, i32* %tmp1, align 8
+ %tmp4 = load i32, i32* %tmp2, align 8
+ %tmp5 = add i32 %tmp3, %tmp4
+ %tmp6 = add i32 %tmp0, %tmp5
+ %i.next = add nuw nsw i64 %i, -1
+ %cond = icmp sgt i64 %i.next, 0
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp14 = phi i32 [ %tmp6, %for.body ]
+ ret i32 %tmp14
+}
+
+; INTER-LABEL: predicated_store
+;
+; Check that a consecutive-like pointer used by a forward interleaved group and
+; scalarized store is not recognized as uniform and is not uniform after
+; vectorization. The store is scalarized because it's in a predicated block.
+; Even though the load in this example is vectorized and uses the pointer as
+; if it were uniform, the scalarized store makes the pointer non-uniform.
+;
+; INTER-NOT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+; INTER: vector.body
+; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, {{.*}} ]
+; INTER: %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
+; INTER: %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>*
+; INTER: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8
+; INTER: %[[I1:.+]] = or i64 %index, 1
+; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
+; INTER: %[[I2:.+]] = or i64 %index, 2
+; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
+; INTER: %[[I3:.+]] = or i64 %index, 3
+; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
+; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
+;
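+; A C sketch of the loop below (hypothetical source):
+;
+;   struct pair { int x, y; };
+;   void predicated_store(struct pair *p, int x, long n) {
+;     for (long i = 0; i < n; i++)
+;       if (p[i].x == x)
+;         p[i].x = x; // conditional store back of the loaded value
+;   }
+;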
+define void @predicated_store(%pair *%p, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+ %tmp1 = load i32, i32* %tmp0, align 8
+ %tmp2 = icmp eq i32 %tmp1, %x
+ br i1 %tmp2, label %if.then, label %if.merge
+
+if.then:
+ store i32 %tmp1, i32* %tmp0, align 8
+ br label %if.merge
+
+if.merge:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: irregular_type
+;
+; Check that a consecutive pointer used by a scalarized store is not recognized
+; as uniform and is not uniform after vectorization. The store is scalarized
+; because the stored type may require padding.
+;
+; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %i
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[I1:.+]] = or i64 %index, 1
+; CHECK: %[[I2:.+]] = or i64 %index, 2
+; CHECK: %[[I3:.+]] = or i64 %index, 3
+; CHECK: getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %index
+; CHECK: getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I1]]
+; CHECK: getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I2]]
+; CHECK: getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I3]]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
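+; A C sketch of the loop below, assuming long double lowers to x86_fp80 on
+; this target (hypothetical source):
+;
+;   void irregular_type(long double *a, long n) {
+;     for (long i = 0; i < n; i++)
+;       a[i] = 1.0L;
+;   }
+;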
+define void @irregular_type(x86_fp80* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = sitofp i32 1 to x86_fp80
+ %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %i
+ store x86_fp80 %tmp0, x86_fp80* %tmp1, align 16
+ %i.next = add i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: pointer_iv_uniform
+;
+; Check that a pointer induction variable is recognized as uniform and remains
+; uniform after vectorization.
+;
+; CHECK: LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NOT: getelementptr
+; CHECK: %next.gep = getelementptr i32, i32* %a, i64 %index
+; CHECK-NOT: getelementptr
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
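+; A C sketch of the loop below (hypothetical source):
+;
+;   void pointer_iv_uniform(int *a, int x, long n) {
+;     for (long i = 0; i < n; i++)
+;       *a++ = x;
+;   }
+;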
+define void @pointer_iv_uniform(i32* %a, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+ store i32 %x, i32* %p, align 8
+ %tmp03 = getelementptr inbounds i32, i32* %p, i32 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; INTER-LABEL: pointer_iv_non_uniform_0
+;
+; Check that a pointer induction variable with a non-uniform user is not
+; recognized as uniform and is not uniform after vectorization. The pointer
+; induction variable is used by getelementptr instructions that are non-uniform
+; due to scalarization of the stores.
+;
+; INTER-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+; INTER: vector.body
+; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTER: %[[I0:.+]] = shl i64 %index, 2
+; INTER: %next.gep = getelementptr i32, i32* %a, i64 %[[I0]]
+; INTER: %[[S1:.+]] = shl i64 %index, 2
+; INTER: %[[I1:.+]] = or i64 %[[S1]], 4
+; INTER: %next.gep2 = getelementptr i32, i32* %a, i64 %[[I1]]
+; INTER: %[[S2:.+]] = shl i64 %index, 2
+; INTER: %[[I2:.+]] = or i64 %[[S2]], 8
+; INTER: %next.gep3 = getelementptr i32, i32* %a, i64 %[[I2]]
+; INTER: %[[S3:.+]] = shl i64 %index, 2
+; INTER: %[[I3:.+]] = or i64 %[[S3]], 12
+; INTER: %next.gep4 = getelementptr i32, i32* %a, i64 %[[I3]]
+; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
+;
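+; A C sketch of the loop below (hypothetical source; the pointer advances by
+; four elements each iteration):
+;
+;   void pointer_iv_non_uniform_0(int *p, long n) {
+;     for (long i = 0; i < n; i++, p += 4) {
+;       p[2] = p[4] - p[0];
+;       p[3] = p[5] - p[1];
+;     }
+;   }
+;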
+define void @pointer_iv_non_uniform_0(i32* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %p = phi i32* [ %tmp03, %for.body ], [ %a, %entry ]
+ %tmp00 = load i32, i32* %p, align 8
+ %tmp01 = getelementptr inbounds i32, i32* %p, i32 1
+ %tmp02 = load i32, i32* %tmp01, align 8
+ %tmp03 = getelementptr inbounds i32, i32* %p, i32 4
+ %tmp04 = load i32, i32* %tmp03, align 8
+ %tmp05 = getelementptr inbounds i32, i32* %p, i32 5
+ %tmp06 = load i32, i32* %tmp05, align 8
+ %tmp07 = sub i32 %tmp04, %tmp00
+  %tmp08 = sub i32 %tmp06, %tmp02
+ %tmp09 = getelementptr inbounds i32, i32* %p, i32 2
+ store i32 %tmp07, i32* %tmp09, align 8
+ %tmp10 = getelementptr inbounds i32, i32* %p, i32 3
+ store i32 %tmp08, i32* %tmp10, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: pointer_iv_non_uniform_1
+;
+; Check that a pointer induction variable with a non-uniform user is not
+; recognized as uniform and is not uniform after vectorization. The pointer
+; induction variable is used by a store that will be scalarized.
+;
+; CHECK-NOT: LV: Found uniform instruction: %p = phi x86_fp80* [%tmp1, %for.body], [%a, %entry]
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %next.gep = getelementptr x86_fp80, x86_fp80* %a, i64 %index
+; CHECK: %[[I1:.+]] = or i64 %index, 1
+; CHECK: %next.gep2 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I1]]
+; CHECK: %[[I2:.+]] = or i64 %index, 2
+; CHECK: %next.gep3 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I2]]
+; CHECK: %[[I3:.+]] = or i64 %index, 3
+; CHECK: %next.gep4 = getelementptr x86_fp80, x86_fp80* %a, i64 %[[I3]]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
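+; A C sketch of the loop below, again assuming long double is x86_fp80
+; (hypothetical source):
+;
+;   void pointer_iv_non_uniform_1(long double *a, long n) {
+;     for (long i = 0; i < n; i++)
+;       *a++ = 1.0L;
+;   }
+;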
+define void @pointer_iv_non_uniform_1(x86_fp80* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %p = phi x86_fp80* [%tmp1, %for.body], [%a, %entry]
+ %tmp0 = sitofp i32 1 to x86_fp80
+ store x86_fp80 %tmp0, x86_fp80* %p, align 16
+ %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %p, i32 1
+ %i.next = add i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: pointer_iv_mixed
+;
+; Check multiple pointer induction variables where only one is recognized as
+; uniform and remains uniform after vectorization. The other pointer induction
+; variable is not recognized as uniform and is not uniform after vectorization
+; because it is stored to memory.
+;
+; CHECK-NOT: LV: Found uniform instruction: %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+; CHECK: LV: Found uniform instruction: %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+; CHECK: vector.body
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %next.gep = getelementptr i32, i32* %a, i64 %index
+; CHECK: %[[I1:.+]] = or i64 %index, 1
+; CHECK: %next.gep10 = getelementptr i32, i32* %a, i64 %[[I1]]
+; CHECK: %[[I2:.+]] = or i64 %index, 2
+; CHECK: %next.gep11 = getelementptr i32, i32* %a, i64 %[[I2]]
+; CHECK: %[[I3:.+]] = or i64 %index, 3
+; CHECK: %next.gep12 = getelementptr i32, i32* %a, i64 %[[I3]]
+; CHECK: %[[V0:.+]] = insertelement <4 x i32*> undef, i32* %next.gep, i32 0
+; CHECK: %[[V1:.+]] = insertelement <4 x i32*> %[[V0]], i32* %next.gep10, i32 1
+; CHECK: %[[V2:.+]] = insertelement <4 x i32*> %[[V1]], i32* %next.gep11, i32 2
+; CHECK: %[[V3:.+]] = insertelement <4 x i32*> %[[V2]], i32* %next.gep12, i32 3
+; CHECK-NOT: getelementptr
+; CHECK: %next.gep13 = getelementptr i32*, i32** %b, i64 %index
+; CHECK-NOT: getelementptr
+; CHECK: %[[B0:.+]] = bitcast i32** %next.gep13 to <4 x i32*>*
+; CHECK: store <4 x i32*> %[[V3]], <4 x i32*>* %[[B0]], align 8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
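+; A C sketch of the loop below (hypothetical source; %p is stored to memory,
+; while %q is only used by memory accesses):
+;
+;   int pointer_iv_mixed(int *a, int **b, long n) {
+;     int s = 0;
+;     for (long i = 0; i < n; i++) {
+;       s += *a;
+;       *b++ = a++;
+;     }
+;     return s;
+;   }
+;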
+define i32 @pointer_iv_mixed(i32* %a, i32** %b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+ %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+ %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
+ %tmp1 = load i32, i32* %p, align 8
+ %tmp2 = add i32 %tmp1, %tmp0
+ store i32* %p, i32** %q, align 8
+ %tmp3 = getelementptr inbounds i32, i32* %p, i32 1
+ %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp5 = phi i32 [ %tmp2, %for.body ]
+ ret i32 %tmp5
+}
+
+; INTER-LABEL: bitcast_pointer_operand
+;
+; Check that a pointer operand having a user other than a memory access is
+; recognized as uniform after vectorization. In this test case, %tmp1 is a
+; bitcast that is used by a load and a getelementptr instruction (%tmp2). Once
+; %tmp2 is marked uniform, %tmp1 should be marked uniform as well.
+;
+; INTER: LV: Found uniform instruction: %cond = icmp slt i64 %i.next, %n
+; INTER-NEXT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3
+; INTER-NEXT: LV: Found uniform instruction: %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i
+; INTER-NEXT: LV: Found uniform instruction: %tmp1 = bitcast i64* %tmp0 to i8*
+; INTER-NEXT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i
+; INTER-NEXT: LV: Found uniform instruction: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+; INTER-NEXT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 1
+; INTER: vector.body:
+; INTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; INTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* %A, i64 [[INDEX]]
+; INTER-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <32 x i8>*
+; INTER-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, <32 x i8>* [[TMP5]], align 1
+; INTER-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+; INTER-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
+; INTER-NEXT: [[TMP6:%.*]] = xor <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]]
+; INTER-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* %B, i64 [[INDEX]]
+; INTER-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; INTER-NEXT: store <4 x i8> [[TMP6]], <4 x i8>* [[TMP8]], align 1
+; INTER-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
+;
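+; A C sketch of the loop below (hypothetical source; the bitcast reinterprets
+; each i64 element as bytes):
+;
+;   void bitcast_pointer_operand(long *A, char *B, long n) {
+;     for (long i = 0; i < n; i++) {
+;       char *bytes = (char *)&A[i];
+;       B[i] = bytes[3] ^ bytes[0];
+;     }
+;   }
+;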
+define void @bitcast_pointer_operand(i64* %A, i8* %B, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i
+ %tmp1 = bitcast i64* %tmp0 to i8*
+ %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3
+ %tmp3 = load i8, i8* %tmp2, align 1
+ %tmp4 = load i8, i8* %tmp1, align 1
+ %tmp5 = xor i8 %tmp3, %tmp4
+ %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i
+ store i8 %tmp5, i8* %tmp6
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/control-flow.ll b/llvm/test/Transforms/LoopVectorize/control-flow.ll
new file mode 100644
index 00000000000..8d56a17994a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/control-flow.ll
@@ -0,0 +1,77 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S -pass-remarks-missed='loop-vectorize' 2>&1 | FileCheck %s
+
+; C/C++ code for control flow test
+; int test(int *A, int Length) {
+; for (int i = 0; i < Length; i++) {
+; if (A[i] > 10.0) goto end;
+; A[i] = 0;
+; }
+; end:
+; return 0;
+; }
+
+; CHECK: remark: source.cpp:5:9: loop not vectorized: loop control flow is not understood by vectorizer
+; CHECK: remark: source.cpp:5:9: loop not vectorized
+
+; CHECK: _Z4testPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define i32 @_Z4testPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 {
+entry:
+ %cmp8 = icmp sgt i32 %Length, 0, !dbg !10
+ br i1 %cmp8, label %for.body.preheader, label %end, !dbg !10
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !12
+
+for.body: ; preds = %for.body.preheader, %if.else
+ %indvars.iv = phi i64 [ %indvars.iv.next, %if.else ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !12
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !12, !tbaa !15
+ %cmp1 = icmp sgt i32 %0, 10, !dbg !12
+ br i1 %cmp1, label %end.loopexit, label %if.else, !dbg !12
+
+if.else: ; preds = %for.body
+ store i32 0, i32* %arrayidx, align 4, !dbg !19, !tbaa !15
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+ %1 = trunc i64 %indvars.iv.next to i32, !dbg !10
+ %cmp = icmp slt i32 %1, %Length, !dbg !10
+ br i1 %cmp, label %for.body, label %end.loopexit, !dbg !10
+
+end.loopexit: ; preds = %if.else, %for.body
+ br label %end
+
+end: ; preds = %end.loopexit, %entry
+ ret i32 0, !dbg !20
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0"}
+!10 = !DILocation(line: 3, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!12 = !DILocation(line: 5, column: 9, scope: !13)
+!13 = distinct !DILexicalBlock(line: 5, column: 9, file: !1, scope: !14)
+!14 = distinct !DILexicalBlock(line: 4, column: 3, file: !1, scope: !11)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C/C++ TBAA"}
+!19 = !DILocation(line: 8, column: 7, scope: !13)
+!20 = !DILocation(line: 12, column: 3, scope: !4)
diff --git a/llvm/test/Transforms/LoopVectorize/cpp-new-array.ll b/llvm/test/Transforms/LoopVectorize/cpp-new-array.ll
new file mode 100644
index 00000000000..53703ccea7c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/cpp-new-array.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @cpp_new_arrays(
+;CHECK: sext i32
+;CHECK: load <4 x float>
+;CHECK: fadd <4 x float>
+;CHECK: ret i32
+define i32 @cpp_new_arrays() uwtable ssp {
+entry:
+ %call = call noalias i8* @_Znwm(i64 4)
+ %0 = bitcast i8* %call to float*
+ store float 1.000000e+03, float* %0, align 4
+ %call1 = call noalias i8* @_Znwm(i64 4)
+ %1 = bitcast i8* %call1 to float*
+ store float 1.000000e+03, float* %1, align 4
+ %call3 = call noalias i8* @_Znwm(i64 4)
+ %2 = bitcast i8* %call3 to float*
+ store float 1.000000e+03, float* %2, align 4
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %idxprom = sext i32 %i.01 to i64
+ %arrayidx = getelementptr inbounds float, float* %0, i64 %idxprom
+ %3 = load float, float* %arrayidx, align 4
+ %idxprom5 = sext i32 %i.01 to i64
+ %arrayidx6 = getelementptr inbounds float, float* %1, i64 %idxprom5
+ %4 = load float, float* %arrayidx6, align 4
+ %add = fadd float %3, %4
+ %idxprom7 = sext i32 %i.01 to i64
+ %arrayidx8 = getelementptr inbounds float, float* %2, i64 %idxprom7
+ store float %add, float* %arrayidx8, align 4
+ %inc = add nsw i32 %i.01, 1
+ %cmp = icmp slt i32 %inc, 1000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %5 = load float, float* %2, align 4
+ %conv10 = fptosi float %5 to i32
+ ret i32 %conv10
+}
+
+declare noalias i8* @_Znwm(i64)
diff --git a/llvm/test/Transforms/LoopVectorize/dbg.value.ll b/llvm/test/Transforms/LoopVectorize/dbg.value.ll
new file mode 100644
index 00000000000..cc8df301982
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/dbg.value.ll
@@ -0,0 +1,77 @@
+; RUN: opt < %s -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine | FileCheck %s
+; Make sure we vectorize with debugging turned on.
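+;
+; A C sketch of the globals and loop below (hypothetical source):
+;
+;   int A[1024], B[1024], C[1024];
+;   int test(void) {
+;     for (int i = 0; i < 1024; i++)
+;       A[i] = B[i] + C[i];
+;     return 0;
+;   }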
+
+source_filename = "test/Transforms/LoopVectorize/dbg.value.ll"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@A = global [1024 x i32] zeroinitializer, align 16, !dbg !0
+@B = global [1024 x i32] zeroinitializer, align 16, !dbg !7
+@C = global [1024 x i32] zeroinitializer, align 16, !dbg !9
+; CHECK-LABEL: @test(
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @test() #0 !dbg !15 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32 0, metadata !19, metadata !21), !dbg !22
+ br label %for.body, !dbg !22
+
+for.body: ; preds = %for.body, %entry
+ ;CHECK: load <4 x i32>
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv, !dbg !23
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !23
+ %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @C, i64 0, i64 %indvars.iv, !dbg !23
+ %1 = load i32, i32* %arrayidx2, align 4, !dbg !23
+ %add = add nsw i32 %1, %0, !dbg !23
+ %arrayidx4 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv, !dbg !23
+ store i32 %add, i32* %arrayidx4, align 4, !dbg !23
+ %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !22
+ tail call void @llvm.dbg.value(metadata !12, metadata !19, metadata !21), !dbg !22
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !22
+ %exitcond = icmp ne i32 %lftr.wideiv, 1024, !dbg !22
+ br i1 %exitcond, label %for.body, label %for.end, !dbg !22
+
+for.end: ; preds = %for.body
+ ret i32 0, !dbg !25
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!11}
+!llvm.module.flags = !{!14}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = !DIGlobalVariable(name: "A", scope: null, file: !2, line: 1, type: !3, isLocal: false, isDefinition: true)
+!2 = !DIFile(filename: "test", directory: "/path/to/somewhere")
+!3 = !DICompositeType(tag: DW_TAG_array_type, baseType: !4, size: 32768, align: 32, elements: !5)
+!4 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!5 = !{!6}
+!6 = !{i32 786465, i64 0, i64 1024}
+!7 = !DIGlobalVariableExpression(var: !8, expr: !DIExpression())
+!8 = !DIGlobalVariable(name: "B", scope: null, file: !2, line: 2, type: !3, isLocal: false, isDefinition: true)
+!9 = !DIGlobalVariableExpression(var: !10, expr: !DIExpression())
+!10 = !DIGlobalVariable(name: "C", scope: null, file: !2, line: 3, type: !3, isLocal: false, isDefinition: true)
+!11 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !12, retainedTypes: !12, globals: !13)
+!12 = !{}
+!13 = !{!0, !7, !9}
+!14 = !{i32 1, !"Debug Info Version", i32 3}
+!15 = distinct !DISubprogram(name: "test", linkageName: "test", scope: !2, file: !2, line: 5, type: !16, isLocal: false, isDefinition: true, scopeLine: 5, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !11, retainedNodes: !18)
+!16 = !DISubroutineType(types: !17)
+!17 = !{!4}
+!18 = !{!19}
+!19 = !DILocalVariable(name: "i", scope: !20, file: !2, line: 6, type: !4)
+!20 = distinct !DILexicalBlock(scope: !15, file: !2, line: 6)
+!21 = !DIExpression()
+!22 = !DILocation(line: 6, scope: !20)
+!23 = !DILocation(line: 7, scope: !24)
+!24 = distinct !DILexicalBlock(scope: !20, file: !2, line: 6)
+!25 = !DILocation(line: 9, scope: !15)
+
diff --git a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
new file mode 100644
index 00000000000..fb929ee4ebd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @dead_instructions_01
+;
+; This test ensures that we don't generate trivially dead instructions prior to
+; instruction simplification. We don't need to generate instructions
+; corresponding to the original induction variable update or branch condition,
+; since we rewrite the loop structure.
+;
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[I0:.+]] = add i64 %index, 0
+; CHECK: %[[I2:.+]] = add i64 %index, 2
+; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[I0]]
+; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[I2]]
+; CHECK-NOT: add nuw nsw i64 %[[I0]], 1
+; CHECK-NOT: add nuw nsw i64 %[[I2]], 1
+; CHECK-NOT: icmp slt i64 {{.*}}, %n
+; CHECK: %index.next = add i64 %index, 4
+; CHECK: %[[CMP:.+]] = icmp eq i64 %index.next, %n.vec
+; CHECK: br i1 %[[CMP]], label %middle.block, label %vector.body
+;
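+; A C sketch of the loop below (hypothetical source):
+;
+;   long dead_instructions_01(long *a, long n) {
+;     long r = 0;
+;     for (long i = 0; i < n; i++)
+;       r += a[i];
+;     return r;
+;   }
+;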
+define i64 @dead_instructions_01(i64 *%a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %r = phi i64 [ %tmp2, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+ %tmp1 = load i64, i64* %tmp0, align 8
+ %tmp2 = add i64 %tmp1, %r
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp3 = phi i64 [ %tmp2, %for.body ]
+ ret i64 %tmp3
+}
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll
new file mode 100644
index 00000000000..e9ec8662ce4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll
@@ -0,0 +1,89 @@
+; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we are preserving debug info in the vectorized code.
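+;
+; A C sketch of the function below (hypothetical source; the types follow the
+; debug info at the end of the file):
+;
+;   int f(int *a, unsigned size) {
+;     unsigned sum = 0;
+;     for (unsigned i = 0; i != size; ++i)
+;       sum += a[i];
+;     return sum;
+;   }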
+
+; CHECK: for.body.lr.ph
+; CHECK: min.iters.check = icmp ult i64 {{.*}}, 2, !dbg !{{[0-9]+}}
+; CHECK: vector.body
+; CHECK: index {{.*}}, !dbg ![[LOC:[0-9]+]]
+; CHECK: getelementptr inbounds i32, i32* %a, {{.*}}, !dbg ![[LOC]]
+; CHECK: load <2 x i32>, <2 x i32>* {{.*}}, !dbg ![[LOC]]
+; CHECK: add <2 x i32> {{.*}}, !dbg ![[LOC]]
+; CHECK: add i64 %index, 2, !dbg ![[LOC]]
+; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]]
+; CHECK: middle.block
+; CHECK: add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[LOC]]
+; CHECK: extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[LOC]]
+
+define i32 @f(i32* nocapture %a, i32 %size) #0 !dbg !4 {
+entry:
+ call void @llvm.dbg.value(metadata i32* %a, metadata !13, metadata !DIExpression()), !dbg !19
+ call void @llvm.dbg.value(metadata i32 %size, metadata !14, metadata !DIExpression()), !dbg !19
+ call void @llvm.dbg.value(metadata i32 0, metadata !15, metadata !DIExpression()), !dbg !20
+ call void @llvm.dbg.value(metadata i32 0, metadata !16, metadata !DIExpression()), !dbg !21
+ %cmp4 = icmp eq i32 %size, 0, !dbg !21
+ br i1 %cmp4, label %for.end, label %for.body.lr.ph, !dbg !21
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body, !dbg !21
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %sum.05 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv, !dbg !22
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !22
+ %add = add i32 %0, %sum.05, !dbg !22
+ %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !22
+ call void @llvm.dbg.value(metadata !{null}, metadata !16, metadata !DIExpression()), !dbg !22
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !22
+ %exitcond = icmp ne i32 %lftr.wideiv, %size, !dbg !22
+ br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge, !dbg !21
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %add.lcssa = phi i32 [ %add, %for.body ]
+ call void @llvm.dbg.value(metadata i32 %add.lcssa, metadata !15, metadata !DIExpression()), !dbg !22
+ br label %for.end, !dbg !21
+
+for.end: ; preds = %entry, %for.cond.for.end_crit_edge
+ %sum.0.lcssa = phi i32 [ %add.lcssa, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ ret i32 %sum.0.lcssa, !dbg !26
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind readonly ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18, !27}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 185038) (llvm/trunk 185097)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "-", directory: "/Volumes/Data/backedup/dev/os/llvm/debug")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "f", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 3, file: !5, scope: !6, type: !7, retainedNodes: !12)
+!5 = !DIFile(filename: "<stdin>", directory: "/Volumes/Data/backedup/dev/os/llvm/debug")
+!6 = !DIFile(filename: "<stdin>", directory: "/Volumes/Data/backedup/dev/os/llvm/debug")
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !10, !11}
+!9 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !9)
+!11 = !DIBasicType(tag: DW_TAG_base_type, name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+!12 = !{!13, !14, !15, !16}
+!13 = !DILocalVariable(name: "a", line: 3, arg: 1, scope: !4, file: !6, type: !10)
+!14 = !DILocalVariable(name: "size", line: 3, arg: 2, scope: !4, file: !6, type: !11)
+!15 = !DILocalVariable(name: "sum", line: 4, scope: !4, file: !6, type: !11)
+!16 = !DILocalVariable(name: "i", line: 5, scope: !17, file: !6, type: !11)
+!17 = distinct !DILexicalBlock(line: 5, column: 0, file: !5, scope: !4)
+!18 = !{i32 2, !"Dwarf Version", i32 3}
+!19 = !DILocation(line: 3, scope: !4)
+!20 = !DILocation(line: 4, scope: !4)
+!21 = !DILocation(line: 5, scope: !17)
+!22 = !DILocation(line: 6, scope: !17)
+!26 = !DILocation(line: 7, scope: !4)
+!27 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll
new file mode 100644
index 00000000000..1e5f18e371c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; getDemandedBits() is called on the pointer-typed GEP instruction here.
+; This test only checks that we do not crash.
+
+; CHECK: @test
+define void @test(i8* %ptr, i8* %ptr_end) {
+start:
+ br label %loop
+
+loop:
+ %ptr2 = phi i8* [ %ptr3, %loop ], [ %ptr, %start ]
+ %x = sext i8 undef to i64
+ %ptr3 = getelementptr inbounds i8, i8* %ptr2, i64 1
+ %cmp = icmp ult i8* %ptr3, %ptr_end
+ br i1 %cmp, label %loop, label %end
+
+end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
new file mode 100644
index 00000000000..bf4eb0876a9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/diag-missing-instr-debug-loc.ll
@@ -0,0 +1,77 @@
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize < %s 2>&1 | FileCheck %s
+
+; 1 extern int map[];
+; 2 extern int out[];
+; 3
+; 4 void f(int a, int n) {
+; 5 for (int i = 0; i < n; ++i) {
+; 6 out[i] = a;
+; 7 a = map[a];
+; 8 }
+; 9 }
+
+; CHECK: remark: /tmp/s.c:5:3: loop not vectorized: value that could not be identified as reduction is used outside the loop
+
+; %a.addr.08 is the phi corresponding to the remark. It does not have a debug
+; location attached. In this case we should use the debug location of the
+; loop rather than emitting <unknown>:0:0.
+
+; ModuleID = '/tmp/s.c'
+source_filename = "/tmp/s.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@out = external local_unnamed_addr global [0 x i32], align 4
+@map = external local_unnamed_addr global [0 x i32], align 4
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @f(i32 %a, i32 %n) local_unnamed_addr #0 !dbg !6 {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0, !dbg !8
+ br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup, !dbg !9
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64, !dbg !9
+ br label %for.body, !dbg !10
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void, !dbg !11
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %a.addr.08 = phi i32 [ %0, %for.body ], [ %a, %for.body.preheader ]
+
+ %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @out, i64 0, i64 %indvars.iv, !dbg !10
+ store i32 %a.addr.08, i32* %arrayidx, align 4, !dbg !12, !tbaa !13
+ %idxprom1 = sext i32 %a.addr.08 to i64, !dbg !17
+ %arrayidx2 = getelementptr inbounds [0 x i32], [0 x i32]* @map, i64 0, i64 %idxprom1, !dbg !17
+ %0 = load i32, i32* %arrayidx2, align 4, !dbg !17, !tbaa !13
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !9
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !9
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9, !llvm.loop !18
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
+!6 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !7, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 5, column: 21, scope: !6)
+!9 = !DILocation(line: 5, column: 3, scope: !6)
+!10 = !DILocation(line: 6, column: 5, scope: !6)
+!11 = !DILocation(line: 9, column: 1, scope: !6)
+!12 = !DILocation(line: 6, column: 12, scope: !6)
+!13 = !{!14, !14, i64 0}
+!14 = !{!"int", !15, i64 0}
+!15 = !{!"omnipotent char", !16, i64 0}
+!16 = !{!"Simple C/C++ TBAA"}
+!17 = !DILocation(line: 7, column: 9, scope: !6)
+!18 = distinct !{!18, !9}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
new file mode 100644
index 00000000000..34641849d88
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info-2.ll
@@ -0,0 +1,200 @@
+; RUN: opt -S -loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s
+; RUN: opt -S -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s
+
+; 1 void cold(char *A, char *B, char *C, char *D, char *E, int N) {
+; 2 for(int i = 0; i < N; i++) {
+; 3 A[i + 1] = A[i] + B[i];
+; 4 C[i] = D[i] * E[i];
+; 5 }
+; 6 }
+; 7
+; 8 void hot(char *A, char *B, char *C, char *D, char *E, int N) {
+; 9 for(int i = 0; i < N; i++) {
+; 10 A[i + 1] = A[i] + B[i];
+; 11 C[i] = D[i] * E[i];
+; 12 }
+; 13 }
+; 14
+; 15 void unknown(char *A, char *B, char *C, char *D, char *E, int N) {
+; 16 for(int i = 0; i < N; i++) {
+; 17 A[i + 1] = A[i] + B[i];
+; 18 C[i] = D[i] * E[i];
+; 19 }
+; 20 }
+
+; CHECK: remark: /tmp/s.c:2:3: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop (hotness: 300)
+; CHECK: remark: /tmp/s.c:9:3: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop (hotness: 5000)
+; CHECK: remark: /tmp/s.c:16:3: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop{{$}}
+
+; ModuleID = '/tmp/s.c'
+source_filename = "/tmp/s.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @cold(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !7 !prof !56 {
+entry:
+ %cmp28 = icmp sgt i32 %N, 0, !dbg !9
+ br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !10, !prof !58
+
+ph:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !12
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !12, !tbaa !13
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !16
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !16, !tbaa !13
+ %add = add i8 %1, %0, !dbg !17
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+ %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !18
+ store i8 %add, i8* %arrayidx7, align 1, !dbg !19, !tbaa !13
+ %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !20
+ %2 = load i8, i8* %arrayidx9, align 1, !dbg !20, !tbaa !13
+ %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !21
+ %3 = load i8, i8* %arrayidx12, align 1, !dbg !21, !tbaa !13
+ %mul = mul i8 %3, %2, !dbg !22
+ %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !23
+ store i8 %mul, i8* %arrayidx16, align 1, !dbg !24, !tbaa !13
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+ %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !10
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !10, !llvm.loop !25, !prof !59
+
+for.cond.cleanup:
+ ret void, !dbg !11
+}
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @hot(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !26 !prof !57 {
+entry:
+ %cmp28 = icmp sgt i32 %N, 0, !dbg !27
+ br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !28, !prof !58
+
+ph:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !30
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !30, !tbaa !13
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !31
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !31, !tbaa !13
+ %add = add i8 %1, %0, !dbg !32
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !28
+ %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !33
+ store i8 %add, i8* %arrayidx7, align 1, !dbg !34, !tbaa !13
+ %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !35
+ %2 = load i8, i8* %arrayidx9, align 1, !dbg !35, !tbaa !13
+ %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !36
+ %3 = load i8, i8* %arrayidx12, align 1, !dbg !36, !tbaa !13
+ %mul = mul i8 %3, %2, !dbg !37
+ %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !38
+ store i8 %mul, i8* %arrayidx16, align 1, !dbg !39, !tbaa !13
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !28
+ %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !28
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !28, !llvm.loop !40, !prof !59
+
+for.cond.cleanup:
+ ret void, !dbg !29
+}
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @unknown(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !41 {
+entry:
+ %cmp28 = icmp sgt i32 %N, 0, !dbg !42
+ br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !43
+
+ph:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !45
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !45, !tbaa !13
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !46
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !46, !tbaa !13
+ %add = add i8 %1, %0, !dbg !47
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !43
+ %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !48
+ store i8 %add, i8* %arrayidx7, align 1, !dbg !49, !tbaa !13
+ %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !50
+ %2 = load i8, i8* %arrayidx9, align 1, !dbg !50, !tbaa !13
+ %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !51
+ %3 = load i8, i8* %arrayidx12, align 1, !dbg !51, !tbaa !13
+ %mul = mul i8 %3, %2, !dbg !52
+ %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !53
+ store i8 %mul, i8* %arrayidx16, align 1, !dbg !54, !tbaa !13
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !43
+ %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !43
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !43, !llvm.loop !55
+
+for.cond.cleanup:
+ ret void, !dbg !44
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 273572) (llvm/trunk 273585)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 3.9.0 (trunk 273572) (llvm/trunk 273585)"}
+!7 = distinct !DISubprogram(name: "cold", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 2, column: 20, scope: !7)
+!10 = !DILocation(line: 2, column: 3, scope: !7)
+!11 = !DILocation(line: 6, column: 1, scope: !7)
+!12 = !DILocation(line: 3, column: 16, scope: !7)
+!13 = !{!14, !14, i64 0}
+!14 = !{!"omnipotent char", !15, i64 0}
+!15 = !{!"Simple C/C++ TBAA"}
+!16 = !DILocation(line: 3, column: 23, scope: !7)
+!17 = !DILocation(line: 3, column: 21, scope: !7)
+!18 = !DILocation(line: 3, column: 5, scope: !7)
+!19 = !DILocation(line: 3, column: 14, scope: !7)
+!20 = !DILocation(line: 4, column: 12, scope: !7)
+!21 = !DILocation(line: 4, column: 19, scope: !7)
+!22 = !DILocation(line: 4, column: 17, scope: !7)
+!23 = !DILocation(line: 4, column: 5, scope: !7)
+!24 = !DILocation(line: 4, column: 10, scope: !7)
+!25 = distinct !{!25, !10}
+!26 = distinct !DISubprogram(name: "hot", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!27 = !DILocation(line: 9, column: 20, scope: !26)
+!28 = !DILocation(line: 9, column: 3, scope: !26)
+!29 = !DILocation(line: 13, column: 1, scope: !26)
+!30 = !DILocation(line: 10, column: 16, scope: !26)
+!31 = !DILocation(line: 10, column: 23, scope: !26)
+!32 = !DILocation(line: 10, column: 21, scope: !26)
+!33 = !DILocation(line: 10, column: 5, scope: !26)
+!34 = !DILocation(line: 10, column: 14, scope: !26)
+!35 = !DILocation(line: 11, column: 12, scope: !26)
+!36 = !DILocation(line: 11, column: 19, scope: !26)
+!37 = !DILocation(line: 11, column: 17, scope: !26)
+!38 = !DILocation(line: 11, column: 5, scope: !26)
+!39 = !DILocation(line: 11, column: 10, scope: !26)
+!40 = distinct !{!40, !28}
+!41 = distinct !DISubprogram(name: "unknown", scope: !1, file: !1, line: 15, type: !8, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!42 = !DILocation(line: 16, column: 20, scope: !41)
+!43 = !DILocation(line: 16, column: 3, scope: !41)
+!44 = !DILocation(line: 20, column: 1, scope: !41)
+!45 = !DILocation(line: 17, column: 16, scope: !41)
+!46 = !DILocation(line: 17, column: 23, scope: !41)
+!47 = !DILocation(line: 17, column: 21, scope: !41)
+!48 = !DILocation(line: 17, column: 5, scope: !41)
+!49 = !DILocation(line: 17, column: 14, scope: !41)
+!50 = !DILocation(line: 18, column: 12, scope: !41)
+!51 = !DILocation(line: 18, column: 19, scope: !41)
+!52 = !DILocation(line: 18, column: 17, scope: !41)
+!53 = !DILocation(line: 18, column: 5, scope: !41)
+!54 = !DILocation(line: 18, column: 10, scope: !41)
+!55 = distinct !{!55, !43}
+!56 = !{!"function_entry_count", i64 3}
+!57 = !{!"function_entry_count", i64 50}
+!58 = !{!"branch_weights", i32 99, i32 1}
+!59 = !{!"branch_weights", i32 1, i32 99}
diff --git a/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
new file mode 100644
index 00000000000..3076d319440
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/diag-with-hotness-info.ll
@@ -0,0 +1,213 @@
+; RUN: opt -S -loop-vectorize -pass-remarks-missed=loop-vectorize \
+; RUN: -pass-remarks-with-hotness < %s 2>&1 | \
+; RUN: FileCheck -check-prefix=HOTNESS -check-prefix=BOTH %s
+
+; RUN: opt -S -loop-vectorize -pass-remarks-missed=loop-vectorize < %s 2>&1 | \
+; RUN: FileCheck -check-prefix=NO_HOTNESS -check-prefix=BOTH %s
+
+
+; RUN: opt -S -passes=loop-vectorize -pass-remarks-missed=loop-vectorize \
+; RUN: -pass-remarks-with-hotness < %s 2>&1 | \
+; RUN: FileCheck -check-prefix=HOTNESS -check-prefix=BOTH %s
+
+; RUN: opt -S -passes=loop-vectorize \
+; RUN: -pass-remarks-missed=loop-vectorize < %s 2>&1 | \
+; RUN: FileCheck -check-prefix=NO_HOTNESS -check-prefix=BOTH %s
+
+
+; 1 void cold(char *A, char *B, char *C, char *D, char *E, int N) {
+; 2 for(int i = 0; i < N; i++) {
+; 3 A[i + 1] = A[i] + B[i];
+; 4 C[i] = D[i] * E[i];
+; 5 }
+; 6 }
+; 7
+; 8 void hot(char *A, char *B, char *C, char *D, char *E, int N) {
+; 9 for(int i = 0; i < N; i++) {
+; 10 A[i + 1] = A[i] + B[i];
+; 11 C[i] = D[i] * E[i];
+; 12 }
+; 13 }
+; 14
+; 15 void unknown(char *A, char *B, char *C, char *D, char *E, int N) {
+; 16 for(int i = 0; i < N; i++) {
+; 17 A[i + 1] = A[i] + B[i];
+; 18 C[i] = D[i] * E[i];
+; 19 }
+; 20 }
+
+; HOTNESS: remark: /tmp/s.c:2:3: loop not vectorized (hotness: 300)
+; NO_HOTNESS: remark: /tmp/s.c:2:3: loop not vectorized{{$}}
+; HOTNESS: remark: /tmp/s.c:9:3: loop not vectorized (hotness: 5000)
+; NO_HOTNESS: remark: /tmp/s.c:9:3: loop not vectorized{{$}}
+; BOTH: remark: /tmp/s.c:16:3: loop not vectorized{{$}}
+
+; ModuleID = '/tmp/s.c'
+source_filename = "/tmp/s.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @cold(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !7 !prof !56 {
+entry:
+ %cmp28 = icmp sgt i32 %N, 0, !dbg !9
+ br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !10, !prof !58
+
+ph:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !12
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !12, !tbaa !13
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !16
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !16, !tbaa !13
+ %add = add i8 %1, %0, !dbg !17
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+ %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !18
+ store i8 %add, i8* %arrayidx7, align 1, !dbg !19, !tbaa !13
+ %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !20
+ %2 = load i8, i8* %arrayidx9, align 1, !dbg !20, !tbaa !13
+ %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !21
+ %3 = load i8, i8* %arrayidx12, align 1, !dbg !21, !tbaa !13
+ %mul = mul i8 %3, %2, !dbg !22
+ %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !23
+ store i8 %mul, i8* %arrayidx16, align 1, !dbg !24, !tbaa !13
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+ %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !10
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !10, !llvm.loop !25, !prof !59
+
+for.cond.cleanup:
+ ret void, !dbg !11
+}
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @hot(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !26 !prof !57 {
+entry:
+ %cmp28 = icmp sgt i32 %N, 0, !dbg !27
+ br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !28, !prof !58
+
+ph:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %ph ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !30
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !30, !tbaa !13
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !31
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !31, !tbaa !13
+ %add = add i8 %1, %0, !dbg !32
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !28
+ %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !33
+ store i8 %add, i8* %arrayidx7, align 1, !dbg !34, !tbaa !13
+ %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !35
+ %2 = load i8, i8* %arrayidx9, align 1, !dbg !35, !tbaa !13
+ %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !36
+ %3 = load i8, i8* %arrayidx12, align 1, !dbg !36, !tbaa !13
+ %mul = mul i8 %3, %2, !dbg !37
+ %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !38
+ store i8 %mul, i8* %arrayidx16, align 1, !dbg !39, !tbaa !13
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !28
+ %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !28
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !28, !llvm.loop !40, !prof !59
+
+for.cond.cleanup:
+ ret void, !dbg !29
+}
+
+; Function Attrs: norecurse nounwind ssp uwtable
+define void @unknown(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !41 {
+entry:
+ %cmp28 = icmp sgt i32 %N, 0, !dbg !42
+ br i1 %cmp28, label %for.body, label %for.cond.cleanup, !dbg !43
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void, !dbg !44
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !45
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !45, !tbaa !13
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !46
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !46, !tbaa !13
+ %add = add i8 %1, %0, !dbg !47
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !43
+ %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !48
+ store i8 %add, i8* %arrayidx7, align 1, !dbg !49, !tbaa !13
+ %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !50
+ %2 = load i8, i8* %arrayidx9, align 1, !dbg !50, !tbaa !13
+ %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !51
+ %3 = load i8, i8* %arrayidx12, align 1, !dbg !51, !tbaa !13
+ %mul = mul i8 %3, %2, !dbg !52
+ %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !53
+ store i8 %mul, i8* %arrayidx16, align 1, !dbg !54, !tbaa !13
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !43
+ %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !43
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !43, !llvm.loop !55
+}
+
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 273572) (llvm/trunk 273585)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 3.9.0 (trunk 273572) (llvm/trunk 273585)"}
+!7 = distinct !DISubprogram(name: "cold", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 2, column: 20, scope: !7)
+!10 = !DILocation(line: 2, column: 3, scope: !7)
+!11 = !DILocation(line: 6, column: 1, scope: !7)
+!12 = !DILocation(line: 3, column: 16, scope: !7)
+!13 = !{!14, !14, i64 0}
+!14 = !{!"omnipotent char", !15, i64 0}
+!15 = !{!"Simple C/C++ TBAA"}
+!16 = !DILocation(line: 3, column: 23, scope: !7)
+!17 = !DILocation(line: 3, column: 21, scope: !7)
+!18 = !DILocation(line: 3, column: 5, scope: !7)
+!19 = !DILocation(line: 3, column: 14, scope: !7)
+!20 = !DILocation(line: 4, column: 12, scope: !7)
+!21 = !DILocation(line: 4, column: 19, scope: !7)
+!22 = !DILocation(line: 4, column: 17, scope: !7)
+!23 = !DILocation(line: 4, column: 5, scope: !7)
+!24 = !DILocation(line: 4, column: 10, scope: !7)
+!25 = distinct !{!25, !10}
+!26 = distinct !DISubprogram(name: "hot", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!27 = !DILocation(line: 9, column: 20, scope: !26)
+!28 = !DILocation(line: 9, column: 3, scope: !26)
+!29 = !DILocation(line: 13, column: 1, scope: !26)
+!30 = !DILocation(line: 10, column: 16, scope: !26)
+!31 = !DILocation(line: 10, column: 23, scope: !26)
+!32 = !DILocation(line: 10, column: 21, scope: !26)
+!33 = !DILocation(line: 10, column: 5, scope: !26)
+!34 = !DILocation(line: 10, column: 14, scope: !26)
+!35 = !DILocation(line: 11, column: 12, scope: !26)
+!36 = !DILocation(line: 11, column: 19, scope: !26)
+!37 = !DILocation(line: 11, column: 17, scope: !26)
+!38 = !DILocation(line: 11, column: 5, scope: !26)
+!39 = !DILocation(line: 11, column: 10, scope: !26)
+!40 = distinct !{!40, !28}
+!41 = distinct !DISubprogram(name: "unknown", scope: !1, file: !1, line: 15, type: !8, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!42 = !DILocation(line: 16, column: 20, scope: !41)
+!43 = !DILocation(line: 16, column: 3, scope: !41)
+!44 = !DILocation(line: 20, column: 1, scope: !41)
+!45 = !DILocation(line: 17, column: 16, scope: !41)
+!46 = !DILocation(line: 17, column: 23, scope: !41)
+!47 = !DILocation(line: 17, column: 21, scope: !41)
+!48 = !DILocation(line: 17, column: 5, scope: !41)
+!49 = !DILocation(line: 17, column: 14, scope: !41)
+!50 = !DILocation(line: 18, column: 12, scope: !41)
+!51 = !DILocation(line: 18, column: 19, scope: !41)
+!52 = !DILocation(line: 18, column: 17, scope: !41)
+!53 = !DILocation(line: 18, column: 5, scope: !41)
+!54 = !DILocation(line: 18, column: 10, scope: !41)
+!55 = distinct !{!55, !43}
+!56 = !{!"function_entry_count", i64 3}
+!57 = !{!"function_entry_count", i64 50}
+!58 = !{!"branch_weights", i32 99, i32 1}
+!59 = !{!"branch_weights", i32 1, i32 99}
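+;
+; A rough sketch (not part of the test) of where the hotness values above come
+; from: remark hotness is the profile count of the loop header, i.e. the
+; function entry count scaled by the expected trip count (~100, from the 1:99
+; exit:continue latch weights in !59):
+;   @cold: entry count 3 (!56)  * ~100 iterations ~= 300
+;   @hot:  entry count 50 (!57) * ~100 iterations ~= 5000
+; @unknown carries no !prof entry count, so its remark has no hotness.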
diff --git a/llvm/test/Transforms/LoopVectorize/disable_nonforced.ll b/llvm/test/Transforms/LoopVectorize/disable_nonforced.ll
new file mode 100644
index 00000000000..7df63ac7e27
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/disable_nonforced.ll
@@ -0,0 +1,29 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Check that the disable_nonforced loop property is honored by the
+; loop vectorizer.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced(
+; CHECK-NOT: x i32>
+define void @disable_nonforced(i32* nocapture %a, i32 %n) {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = trunc i64 %indvars.iv to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}}
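+;
+; For reference, a sketch of the two metadata shapes this pair of tests
+; exercises (the second form appears in disable_nonforced_enable.ll):
+;   !0 = !{!0, !{!"llvm.loop.disable_nonforced"}}
+;       ; all non-forced transformations disabled
+;   !0 = !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.vectorize.enable", i32 1}}
+;       ; vectorization explicitly forced, so it runs anyway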
diff --git a/llvm/test/Transforms/LoopVectorize/disable_nonforced_enable.ll b/llvm/test/Transforms/LoopVectorize/disable_nonforced_enable.ll
new file mode 100644
index 00000000000..7541ac38999
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/disable_nonforced_enable.ll
@@ -0,0 +1,29 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Check that the llvm.loop.vectorize.enable loop property overrides
+; llvm.loop.disable_nonforced.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced_enable(
+; CHECK: store <2 x i32>
+define void @disable_nonforced_enable(i32* nocapture %a, i32 %n) {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = trunc i64 %indvars.iv to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.disable_nonforced"}, !{!"llvm.loop.vectorize.enable", i32 1}}
diff --git a/llvm/test/Transforms/LoopVectorize/discriminator.ll b/llvm/test/Transforms/LoopVectorize/discriminator.ll
new file mode 100644
index 00000000000..6c7ac495799
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/discriminator.ll
@@ -0,0 +1,76 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck --check-prefix=DBG_VALUE --check-prefix=LOOPVEC_4_1 %s
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=3 < %s | FileCheck --check-prefix=DBG_VALUE --check-prefix=LOOPVEC_2_3 %s
+; RUN: opt -S -loop-unroll -unroll-count=5 < %s | FileCheck --check-prefix=DBG_VALUE --check-prefix=LOOPUNROLL_5 %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -loop-unroll -unroll-count=2 < %s | FileCheck --check-prefix=DBG_VALUE --check-prefix=LOOPVEC_UNROLL %s
+
+; Test that the vectorization/unroll factor is recorded in the discriminator
+; (see the note at the end of this file).
+;
+; Original source code:
+; 1 int *a;
+; 2 int *b;
+; 3
+; 4 void foo() {
+; 5 for (int i = 0; i < 4096; i++)
+; 6 a[i] += b[i];
+; 7 }
+
+@a = local_unnamed_addr global i32* null, align 8
+@b = local_unnamed_addr global i32* null, align 8
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+define void @_Z3foov() local_unnamed_addr #0 !dbg !6 {
+ %1 = load i32*, i32** @b, align 8, !dbg !8, !tbaa !9
+ %2 = load i32*, i32** @a, align 8, !dbg !13, !tbaa !9
+ br label %3, !dbg !14
+
+; <label>:3: ; preds = %3, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %3 ]
+ %4 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv, !dbg !8
+ %5 = load i32, i32* %4, align 4, !dbg !8, !tbaa !15
+ %6 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv, !dbg !13
+ %7 = load i32, i32* %6, align 4, !dbg !17, !tbaa !15
+ %8 = add nsw i32 %7, %5, !dbg !17
+;DBG_VALUE: call void @llvm.dbg.declare{{.*}}!dbg ![[DBG:[0-9]*]]
+ call void @llvm.dbg.declare(metadata i32 %8, metadata !22, metadata !DIExpression()), !dbg !17
+ store i32 %8, i32* %6, align 4, !dbg !17, !tbaa !15
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !18
+ %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !19
+ br i1 %exitcond, label %9, label %3, !dbg !14, !llvm.loop !20
+
+; <label>:9: ; preds = %3
+ ret void, !dbg !21
+}
+
+;DBG_VALUE: ![[TOP:[0-9]*]] = distinct !DISubprogram(name: "foo"
+;LOOPVEC_4_1: discriminator: 17
+;LOOPVEC_2_3: discriminator: 25
+;LOOPUNROLL_5: discriminator: 21
+; When unrolling after loop vectorization, both the vector body and the
+; remainder loop are unrolled.
+;LOOPVEC_UNROLL: discriminator: 385
+;LOOPVEC_UNROLL: discriminator: 9
+;DBG_VALUE: ![[DBG]] = {{.*}}, scope: ![[TOP]]
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, debugInfoForProfiling: true)
+!1 = !DIFile(filename: "a.cc", directory: "/")
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, unit: !0)
+!8 = !DILocation(line: 6, column: 13, scope: !6)
+!9 = !{!10, !10, i64 0}
+!10 = !{!"any pointer", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C++ TBAA"}
+!13 = !DILocation(line: 6, column: 5, scope: !6)
+!14 = !DILocation(line: 5, column: 3, scope: !6)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !11, i64 0}
+!17 = !DILocation(line: 6, column: 10, scope: !6)
+!18 = !DILocation(line: 5, column: 30, scope: !6)
+!19 = !DILocation(line: 5, column: 21, scope: !6)
+!20 = distinct !{!20, !14}
+!21 = !DILocation(line: 7, column: 1, scope: !6)
+!22 = !DILocalVariable(name: "a", arg: 1, scope: !6, file: !1, line: 10)
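+;
+; A hedged sketch of how the checked discriminators relate to the duplication
+; factors in these runs: for the single-transformation cases the value matches
+; (factor << 2) | 1, i.e. VF*IC = 4*1 = 4 -> 17, VF*IC = 2*3 = 6 -> 25,
+; unroll 5 -> 21, and the remainder loop unrolled by 2 -> 9. The exact bit
+; layout is an implementation detail of the DILocation discriminator encoding.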
diff --git a/llvm/test/Transforms/LoopVectorize/ee-crash.ll b/llvm/test/Transforms/LoopVectorize/ee-crash.ll
new file mode 100644
index 00000000000..8605ec24f7f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ee-crash.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; This test checks that we deal with an in-loop extractelement (for now, by
+; bailing out of vectorization rather than crashing).
+; CHECK-LABEL: @_Z4foo1Pii(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+define i32 @_Z4foo1Pii(i32* %A, i32 %n, <2 x i32> %q) #0 {
+entry:
+ %idx.ext = sext i32 %n to i64
+ %add.ptr = getelementptr inbounds i32, i32* %A, i64 %idx.ext
+ %cmp3.i = icmp eq i32 %n, 0
+ br i1 %cmp3.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+for.body.i: ; preds = %entry, %for.body.i
+ %__init.addr.05.i = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+ %__first.addr.04.i = phi i32* [ %incdec.ptr.i, %for.body.i ], [ %A, %entry ]
+ %0 = load i32, i32* %__first.addr.04.i, align 4
+ %q1 = extractelement <2 x i32> %q, i32 %n
+ %q2 = add nsw i32 %0, %q1
+ %add.i = add nsw i32 %q2, %__init.addr.05.i
+ %incdec.ptr.i = getelementptr inbounds i32, i32* %__first.addr.04.i, i64 1
+ %cmp.i = icmp eq i32* %incdec.ptr.i, %add.ptr
+ br i1 %cmp.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit: ; preds = %for.body.i, %entry
+ %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+ ret i32 %__init.addr.0.lcssa.i
+}
+
+attributes #0 = { nounwind readonly ssp uwtable }
+
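+; A note on why vectorization is rejected here (an assumption, not asserted by
+; the test): the in-loop extractelement %q1 uses a variable lane index (%n),
+; which the vectorizer has no way to widen; bailing out is all the CHECK
+; lines require.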
diff --git a/llvm/test/Transforms/LoopVectorize/exact.ll b/llvm/test/Transforms/LoopVectorize/exact.ll
new file mode 100644
index 00000000000..6860c0d04fb
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/exact.ll
@@ -0,0 +1,23 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @lshr_exact(
+; CHECK: lshr exact <4 x i32>
+define void @lshr_exact(i32* %x) {
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %x, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %conv1 = lshr exact i32 %0, 1
+ store i32 %conv1, i32* %arrayidx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
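+
+; A sketch (mirroring the CHECK above) of the widening this test expects: the
+; exact flag on the scalar shift is preserved on the widened instruction, e.g.
+;   %conv1 = lshr exact i32 %0, 1
+; becomes
+;   %1 = lshr exact <4 x i32> %wide.load, <i32 1, i32 1, i32 1, i32 1>
+; (%1 and %wide.load are illustrative names).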
diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
new file mode 100644
index 00000000000..33527b307da
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_detection.ll
@@ -0,0 +1,236 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that only outer loops annotated with the expected explicit
+; vectorization hints are collected for vectorization, in preference to
+; their inner loops (a sketch of the hint metadata follows the test cases).
+
+; Root C/C++ source code for all the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable)
+; for (i = 0; i < N; i++) {
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1: Annotated outer loop WITH vector width information must be collected.
+
+; CHECK-LABEL: vector_width
+; CHECK: LV: Loop hints: force=enabled width=4 unroll=0
+; CHECK: LV: We can vectorize this outer loop!
+; CHECK: LV: Using user VF 4 to build VPlans.
+; CHECK-NOT: LV: Loop hints: force=?
+; CHECK-NOT: LV: Found a loop: inner.body
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @vector_width(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 2: Annotated outer loop WITHOUT vector width information must be collected.
+
+; CHECK-LABEL: case2
+; CHECK: LV: Loop hints: force=enabled width=0 unroll=0
+; CHECK: LV: We can vectorize this outer loop!
+; CHECK: LV: Using VF 1 to build VPlans.
+
+define void @case2(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !9
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 3: Annotated outer loop WITH vector width and interleave information
+; must not be collected.
+
+; CHECK-LABEL: case3
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case3(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body, !llvm.loop !11
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 4: Outer loop without any explicit vectorization annotation must not
+; be collected.
+
+; CHECK-LABEL: case4
+; CHECK-NOT: LV: Loop hints: force=enabled
+; CHECK-NOT: LV: We can vectorize this outer loop!
+; CHECK: LV: Loop hints: force=?
+; CHECK: LV: Found a loop: inner.body
+
+define void @case4(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+; Case 1
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
+; Case 2
+!9 = distinct !{!9, !8}
+; Case 3
+!10 = !{!"llvm.loop.interleave.count", i32 2}
+!11 = distinct !{!11, !7, !10, !8}
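+; For reference, a sketch of the pragma-to-metadata mapping these cases rely
+; on (illustrative; Case 3 additionally carries !10, the interleave count,
+; which keeps the outer loop out of explicit outer-loop vectorization):
+;   #pragma clang loop vectorize(enable)                    -> !8
+;   #pragma clang loop vectorize(enable) vectorize_width(4) -> !7 + !8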
diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
new file mode 100644
index 00000000000..d0ab58b30e3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_nonuniform_inner.ll
@@ -0,0 +1,177 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that LV bails out on outer loops marked for explicit vectorization
+; that contain divergent inner loops (see the note after the metadata below).
+
+; Root C/C++ source code for all the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable) vectorize_width(8)
+; for (i = 0; i < N; i++) {
+; // Tested inner loop. It will be replaced per test.
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1 (for (j = i; j < M; j++)): Inner loop with divergent IV start.
+
+; CHECK-LABEL: iv_start
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @iv_start(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp33 = icmp sgt i32 %N, 0
+ br i1 %cmp33, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count41 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ]
+ %cmp231 = icmp slt i64 %indvars.iv38, %0
+ br i1 %cmp231, label %inner.ph, label %outer.inc
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv38, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv35 = phi i64 [ %indvars.iv38, %inner.ph ], [ %indvars.iv.next36, %inner.body ]
+ %2 = add nsw i64 %indvars.iv35, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond = icmp eq i64 %indvars.iv.next36, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+ %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41
+ br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+
+; Case 2 (for (j = 0; j < i; j++)): Inner loop with divergent upper-bound.
+
+; CHECK-LABEL: loop_ub
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @loop_ub(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %0 = sext i32 %M to i64
+ %wide.trip.count41 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv38 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next39, %outer.inc ]
+ %cmp230 = icmp eq i64 %indvars.iv38, 0
+ br i1 %cmp230, label %outer.inc, label %inner.ph
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv38, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.body, %inner.ph
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %indvars.iv38
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next39 = add nuw nsw i64 %indvars.iv38, 1
+ %exitcond42 = icmp eq i64 %indvars.iv.next39, %wide.trip.count41
+ br i1 %exitcond42, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %outer.inc, %entry
+ ret void
+}
+
+; Case 3 (for (j = 0; j < M; j+=i)): Inner loop with divergent step.
+
+; CHECK-LABEL: iv_step
+; CHECK: LV: Not vectorizing: Outer loop contains divergent loops.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @iv_step(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp33 = icmp sgt i32 %N, 0
+ br i1 %cmp33, label %outer.ph, label %for.end15
+
+outer.ph: ; preds = %entry
+ %cmp231 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %N to i64
+ br label %outer.body
+
+outer.body: ; preds = %for.inc14, %outer.ph
+ %indvars.iv39 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next40, %for.inc14 ]
+ br i1 %cmp231, label %inner.ph, label %for.inc14
+
+inner.ph: ; preds = %outer.body
+ %1 = mul nsw i64 %indvars.iv39, %0
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv36 = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next37, %inner.body ]
+ %2 = add nsw i64 %indvars.iv36, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %mul8 = mul nsw i32 %3, %3
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %mul8, i32* %arrayidx12, align 4, !tbaa !2
+ %indvars.iv.next37 = add nuw nsw i64 %indvars.iv36, %indvars.iv39
+ %cmp2 = icmp slt i64 %indvars.iv.next37, %0
+ br i1 %cmp2, label %inner.body, label %for.inc14
+
+for.inc14: ; preds = %inner.body, %outer.body
+ %indvars.iv.next40 = add nuw nsw i64 %indvars.iv39, 1
+ %exitcond = icmp eq i64 %indvars.iv.next40, %wide.trip.count
+ br i1 %exitcond, label %for.end15, label %outer.body, !llvm.loop !6
+
+for.end15: ; preds = %for.inc14, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
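+;
+; Note: in all three cases the inner loop's start, bound, or step depends on
+; the outer induction variable i, so each vector lane of the outer loop would
+; execute a different inner trip count; that is what makes these inner loops
+; divergent and unsupported.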
diff --git a/llvm/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll b/llvm/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll
new file mode 100644
index 00000000000..e05e9dd813b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/explicit_outer_uniform_diverg_branch.ll
@@ -0,0 +1,138 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Verify that LV can handle outer loops marked for explicit vectorization when
+; their branches are uniform, but bails out when a branch is divergent (see
+; the note after the metadata below).
+
+; Root C/C++ source code for the test cases
+; void foo(int *a, int *b, int N, int M)
+; {
+; int i, j;
+; #pragma clang loop vectorize(enable) vectorize_width(8)
+; for (i = 0; i < N; i++) {
+; // Tested conditional branch. COND will be replaced per test.
+; if (COND)
+; for (j = 0; j < M; j++) {
+; a[i*M+j] = b[i*M+j] * b[i*M+j];
+; }
+; }
+; }
+
+; Case 1 (COND => M == N): Outer loop with uniform conditional branch.
+
+; CHECK-LABEL: uniform_branch
+; CHECK: LV: We can vectorize this outer loop!
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @uniform_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp39 = icmp sgt i32 %N, 0
+ br i1 %cmp39, label %outer.ph, label %for.end19
+
+outer.ph: ; preds = %entry
+ %cmp337 = icmp slt i32 %M, 1
+ %0 = sext i32 %M to i64
+ %N64 = zext i32 %N to i64
+ %M64 = zext i32 %M to i64
+ %cmp1 = icmp ne i32 %M, %N ; Uniform condition
+ %brmerge = or i1 %cmp1, %cmp337 ; Uniform condition
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
+ %1 = mul nsw i64 %indvars.iv42, %0
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ br i1 %brmerge, label %outer.inc, label %inner.ph ; Supported uniform branch
+
+inner.ph: ; preds = %outer.body
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
+ %3 = add nsw i64 %indvars.iv, %1
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
+ %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+ %mul12 = mul nsw i32 %4, %4
+ %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
+ store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %M64
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+ %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
+ br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6
+
+for.end19: ; preds = %outer.inc, %entry
+ ret void
+}
+
+
+; Case 2 (COND => B[i * M] == 0): Outer loop with divergent conditional branch.
+
+; CHECK-LABEL: divergent_branch
+; CHECK: Unsupported conditional branch.
+; CHECK: LV: Not vectorizing: Unsupported outer loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @divergent_branch(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) local_unnamed_addr {
+entry:
+ %cmp39 = icmp sgt i32 %N, 0
+ br i1 %cmp39, label %outer.ph, label %for.end19
+
+outer.ph: ; preds = %entry
+ %cmp337 = icmp slt i32 %M, 1
+ %0 = sext i32 %M to i64
+ %N64 = zext i32 %N to i64
+ %M64 = zext i32 %M to i64
+ br label %outer.body
+
+outer.body: ; preds = %outer.inc, %outer.ph
+ %indvars.iv42 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next43, %outer.inc ]
+ %1 = mul nsw i64 %indvars.iv42, %0
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4, !tbaa !2
+ %cmp1 = icmp ne i32 %2, 0 ; Divergent condition
+ %brmerge = or i1 %cmp1, %cmp337 ; Divergent condition
+ br i1 %brmerge, label %outer.inc, label %inner.ph ; Unsupported divergent branch.
+
+inner.ph: ; preds = %outer.body
+ br label %inner.body
+
+inner.body: ; preds = %inner.ph, %inner.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %inner.body ], [ 0, %inner.ph ]
+ %3 = add nsw i64 %indvars.iv, %1
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i64 %3
+ %4 = load i32, i32* %arrayidx7, align 4, !tbaa !2
+ %mul12 = mul nsw i32 %4, %4
+ %arrayidx16 = getelementptr inbounds i32, i32* %a, i64 %3
+ store i32 %mul12, i32* %arrayidx16, align 4, !tbaa !2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %M64
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc: ; preds = %inner.body, %outer.body
+ %indvars.iv.next43 = add nuw nsw i64 %indvars.iv42, 1
+ %exitcond46 = icmp eq i64 %indvars.iv.next43, %N64
+ br i1 %exitcond46, label %for.end19, label %outer.body, !llvm.loop !6
+
+for.end19: ; preds = %outer.inc, %entry
+ ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7, !8}
+!7 = !{!"llvm.loop.vectorize.width", i32 8}
+!8 = !{!"llvm.loop.vectorize.enable", i1 true}
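+;
+; Note: the two functions differ only in how %brmerge is computed. In
+; @uniform_branch it derives from the loop-invariant values %M and %N, so all
+; outer-loop lanes take the same direction; in @divergent_branch it derives
+; from the loaded value b[i*M], which can differ per lane.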
diff --git a/llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll b/llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll
new file mode 100644
index 00000000000..ae7ce70057f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/fcmp-vectorize.ll
@@ -0,0 +1,25 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
+
+; Avoid crashing while trying to vectorize an fcmp that can be folded to a
+; constant vector of i1.
+define void @test1() {
+; CHECK-LABEL: test1(
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
+; CHECK: %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %index.next = add i32 %index, 4
+
+entry:
+ br label %loop
+
+loop: ; preds = %loop, %entry
+ %iv = phi i32 [ 0, %entry ], [ %ivnext, %loop ]
+ %fcmp = fcmp uno float 0.000000e+00, 0.000000e+00
+ %ivnext = add nsw i32 %iv, 1
+ %cnd = icmp sgt i32 %iv, 142
+ br i1 %cnd, label %exit, label %loop
+
+exit: ; preds = %loop
+ ret void
+}
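+
+; Note: fcmp uno with two non-NaN constant operands constant-folds to false,
+; and the result is unused; the point of the test is only that the vectorizer
+; emits the induction updates shown above without crashing.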
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
new file mode 100644
index 00000000000..998f412674b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -0,0 +1,574 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-VF
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s --check-prefix=SINK-AFTER
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s --check-prefix=NO-SINK-AFTER
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; void recurrence_1(int *a, int *b, int n) {
+; for(int i = 0; i < n; i++)
+;     b[i] = a[i] + a[i - 1];
+; }
+;
+; CHECK-LABEL: @recurrence_1(
+; CHECK: vector.ph:
+; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3
+; CHECK: vector.body:
+; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: [[L1]] = load <4 x i32>
+; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
+; CHECK: scalar.ph:
+; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ]
+; CHECK: scalar.body:
+; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_1(
+; UNROLL: vector.body:
+; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; UNROLL: [[L2]] = load <4 x i32>
+; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL: middle.block:
+; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
+;
+define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
+entry:
+ br label %for.preheader
+
+for.preheader:
+ %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
+ %pre_load = load i32, i32* %arrayidx.phi.trans.insert
+ br label %scalar.body
+
+scalar.body:
+ %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ]
+ %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+ %1 = load i32, i32* %arrayidx32
+ %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %add35 = add i32 %1, %0
+ store i32 %add35, i32* %arrayidx34
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.exit, label %scalar.body
+
+for.exit:
+ ret void
+}
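+
+; A sketch (following the CHECK lines above; %L1 and %recur are illustrative
+; names) of the core first-order-recurrence transformation: the a[i-1] values
+; for the current vector iteration are formed by splicing the last element of
+; the previous iteration's load with the first three elements of the current
+; one:
+;   %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %L1, %vector.body ]
+;   %recur = shufflevector <4 x i32> %vector.recur, <4 x i32> %L1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>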
+
+; int recurrence_2(int *a, int n) {
+; int minmax;
+; for (int i = 0; i < n; ++i)
+; minmax = min(minmax, max(a[i] - a[i-1], 0));
+; return minmax;
+; }
+;
+; CHECK-LABEL: @recurrence_2(
+; CHECK: vector.ph:
+; CHECK: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
+; CHECK: vector.body:
+; CHECK: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: [[L1]] = load <4 x i32>
+; CHECK: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
+; CHECK: scalar.ph:
+; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %for.preheader ]
+; CHECK: scalar.body:
+; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_2(
+; UNROLL: vector.body:
+; UNROLL: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; UNROLL: [[L2]] = load <4 x i32>
+; UNROLL: {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL: {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL: middle.block:
+; UNROLL: %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
+;
+define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
+entry:
+ %cmp27 = icmp sgt i32 %n, 0
+ br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
+
+for.preheader:
+ %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
+ %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+ br label %scalar.body
+
+for.cond.cleanup.loopexit:
+ %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %minmax.0.lcssa
+
+scalar.body:
+ %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
+ %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
+ %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx, align 4
+ %sub3 = sub nsw i32 %1, %0
+ %cmp4 = icmp sgt i32 %sub3, 0
+ %cond = select i1 %cmp4, i32 %sub3, i32 0
+ %cmp5 = icmp slt i32 %minmax.028, %cond
+ %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body
+}
+
+; void recurrence_3(short *a, double *b, int n, float f, short p) {
+; b[0] = (double)a[0] - f * (double)p;
+; for (int i = 1; i < n; i++)
+; b[i] = (double)a[i] - f * (double)a[i - 1];
+; }
+;
+; CHECK-LABEL: @recurrence_3(
+; CHECK: vector.ph:
+; CHECK: %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3
+; CHECK: vector.body:
+; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: [[L1]] = load <4 x i16>
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; Check also that the casts were not moved needlessly.
+; CHECK: sitofp <4 x i16> [[L1]] to <4 x double>
+; CHECK: sitofp <4 x i16> [[SHUF]] to <4 x double>
+; CHECK: middle.block:
+; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
+; CHECK: scalar.ph:
+; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %for.preheader ]
+; CHECK: scalar.body:
+; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_3(
+; UNROLL: vector.body:
+; UNROLL: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL: [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16>
+; UNROLL: [[L2]] = load <4 x i16>
+; UNROLL: {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL: {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL: middle.block:
+; UNROLL: %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3
+;
+define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) {
+entry:
+ %0 = load i16, i16* %a, align 2
+ %conv = sitofp i16 %0 to double
+ %conv1 = fpext float %f to double
+ %conv2 = sitofp i16 %p to double
+ %mul = fmul fast double %conv2, %conv1
+ %sub = fsub fast double %conv, %mul
+ store double %sub, double* %b, align 8
+ %cmp25 = icmp sgt i32 %n, 1
+ br i1 %cmp25, label %for.preheader, label %for.end
+
+for.preheader:
+ br label %scalar.body
+
+scalar.body:
+ %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ]
+ %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ]
+ %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv
+ %2 = load i16, i16* %arrayidx5, align 2
+ %conv6 = sitofp i16 %2 to double
+ %conv11 = sitofp i16 %1 to double
+ %mul12 = fmul fast double %conv11, %conv1
+ %sub13 = fsub fast double %conv6, %mul12
+ %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv
+ store double %sub13, double* %arrayidx15, align 8
+ %advars.iv.next = add nuw nsw i64 %advars.iv, 1
+ %lftr.wideiv = trunc i64 %advars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end.loopexit, label %scalar.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; void PR26734(short *a, int *b, int *c, int d, short *e) {
+; for (; d != 21; d++) {
+; *b &= *c;
+; *e = *a - 6;
+; *c = *e;
+; }
+; }
+;
+; CHECK-LABEL: @PR26734(
+; CHECK-NOT: vector.ph:
+; CHECK: }
+;
+define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) {
+entry:
+ %cmp4 = icmp eq i32 %d, 21
+ br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph
+
+entry.for.end_crit_edge:
+ %.pre = load i32, i32* %b, align 4
+ br label %for.end
+
+for.body.lr.ph:
+ %0 = load i16, i16* %a, align 2
+ %sub = add i16 %0, -6
+ %conv2 = sext i16 %sub to i32
+ %c.promoted = load i32, i32* %c, align 4
+ %b.promoted = load i32, i32* %b, align 4
+ br label %for.body
+
+for.body:
+ %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ]
+ %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ]
+ %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ]
+ %and = and i32 %and6, %conv25
+ %inc = add nsw i32 %inc7, 1
+ %cmp = icmp eq i32 %inc, 21
+ br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+ %and.lcssa = phi i32 [ %and, %for.body ]
+ store i32 %conv2, i32* %c, align 4
+ store i32 %and.lcssa, i32* %b, align 4
+ store i16 %sub, i16* %e, align 2
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; int PR27246() {
+; unsigned int e, n;
+; for (int i = 1; i < 49; ++i) {
+; for (int k = i; k > 1; --k)
+; e = k;
+; n = e;
+; }
+; return n;
+; }
+;
+; CHECK-LABEL: @PR27246(
+; CHECK-NOT: vector.ph:
+; CHECK: }
+;
+define i32 @PR27246() {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ]
+ %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ]
+ br label %for.cond1
+
+for.cond.cleanup:
+ %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ]
+ ret i32 %e.1.lcssa.lcssa
+
+for.cond1:
+ %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
+ %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
+ %cmp2 = icmp sgt i32 %k.0, 1
+ %dec = add nsw i32 %k.0, -1
+ br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
+
+for.cond.cleanup3:
+ %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ]
+ %inc = add nuw nsw i32 %i.016, 1
+ %exitcond = icmp eq i32 %inc, 49
+ br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader
+}
+
+; UNROLL-NO-IC-LABEL: @PR30183(
+; UNROLL-NO-IC: vector.ph:
+; UNROLL-NO-IC: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3
+; UNROLL-NO-IC-NEXT: br label %vector.body
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], %vector.ph ], [ [[TMP42:%.*]], %vector.body ]
+; UNROLL-NO-IC: [[TMP27:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP27]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP28]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP29]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP30]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP32]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP33]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP42]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP34]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP38]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; UNROLL-NO-IC: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @PR30183(i32 %pre_load, i32* %a, i32* %b, i64 %n) {
+entry:
+ br label %scalar.body
+
+scalar.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+ %tmp0 = phi i32 [ %pre_load, %entry ], [ %tmp2, %scalar.body ]
+ %i.next = add nuw nsw i64 %i, 2
+ %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i.next
+ %tmp2 = load i32, i32* %tmp1
+ %cond = icmp eq i64 %i.next,%n
+ br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+ ret void
+}
+
+; UNROLL-NO-IC-LABEL: @constant_folded_previous_value(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 undef, i64 undef, i64 undef, i64 0>, %vector.ph ], [ <i64 1, i64 1, i64 1, i64 1>, %vector.body ]
+; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @constant_folded_previous_value() {
+entry:
+ br label %scalar.body
+
+scalar.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+ %tmp2 = phi i64 [ 0, %entry ], [ %tmp3, %scalar.body ]
+ %tmp3 = add i64 0, 1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, undef
+ br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+ ret void
+}
+
+; We vectorize this first-order recurrence by generating two extracts for the
+; phi `val.phi`: one at the last index and another at the second-to-last
+; index. We need both extracts because the first-order recurrence phi is used
+; outside the loop, so we require the phi itself and not its update (addx);
+; a sketch follows the function below.
+; UNROLL-NO-IC-LABEL: extract_second_last_iteration
+; UNROLL-NO-IC: vector.body
+; UNROLL-NO-IC: %step.add = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NO-IC: %[[L1:.+]] = add <4 x i32> %vec.ind, %broadcast.splat
+; UNROLL-NO-IC: %[[L2:.+]] = add <4 x i32> %step.add, %broadcast.splat
+; UNROLL-NO-IC: %index.next = add i32 %index, 8
+; UNROLL-NO-IC: icmp eq i32 %index.next, 96
+; UNROLL-NO-IC: middle.block
+; UNROLL-NO-IC: icmp eq i32 96, 96
+; UNROLL-NO-IC: %vector.recur.extract = extractelement <4 x i32> %[[L2]], i32 3
+; UNROLL-NO-IC: %vector.recur.extract.for.phi = extractelement <4 x i32> %[[L2]], i32 2
+; UNROLL-NO-IC: for.end
+; UNROLL-NO-IC: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
+; Check the case when unrolled but not vectorized.
+; UNROLL-NO-VF-LABEL: extract_second_last_iteration
+; UNROLL-NO-VF: vector.body:
+; UNROLL-NO-VF: %induction = add i32 %index, 0
+; UNROLL-NO-VF: %induction1 = add i32 %index, 1
+; UNROLL-NO-VF: %[[L1:.+]] = add i32 %induction, %x
+; UNROLL-NO-VF: %[[L2:.+]] = add i32 %induction1, %x
+; UNROLL-NO-VF: %index.next = add i32 %index, 2
+; UNROLL-NO-VF: icmp eq i32 %index.next, 96
+; UNROLL-NO-VF: for.end:
+; UNROLL-NO-VF: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %[[L1]], %middle.block ]
+define i32 @extract_second_last_iteration(i32* %cval, i32 %x) {
+entry:
+ br label %for.body
+
+for.body:
+ %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ]
+ %inc = add i32 %inc.phi, 1
+ %bc = zext i32 %inc.phi to i64
+ %addx = add i32 %inc.phi, %x
+ %cmp = icmp eq i32 %inc.phi, 95
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret i32 %val.phi
+}
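+
+; Sketch (from the CHECK lines above): because the loop result is the phi
+; rather than its update, the middle block extracts both the last lane (to
+; seed the scalar remainder) and the second-to-last lane (to feed the result
+; phi in for.end):
+;   %vector.recur.extract = extractelement <4 x i32> %L2, i32 3
+;   %vector.recur.extract.for.phi = extractelement <4 x i32> %L2, i32 2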
+
+; We vectorize this first-order recurrence with a set of insertelements for
+; each unrolled part. Make sure these insertelements are generated in order,
+; because the shuffle of the first-order recurrence will be added after the
+; insertelement of the last part UF - 1, assuming the latter appears after
+; the insertelements of all other parts.
+;
+; int PR33613(double *b, double j, int d) {
+; int a = 0;
+; for(int i = 0; i < 10240; i++, b+=25) {
+; double f = b[d]; // Scalarize to form insertelements
+; if (j * f)
+; a++;
+; j = f;
+; }
+; return a;
+; }
+;
+; UNROLL-NO-IC-LABEL: @PR33613(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC: [[VECTOR_RECUR:%.*]] = phi <4 x double>
+; UNROLL-NO-IC: shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> {{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: shufflevector <4 x double> {{.*}}, <4 x double> {{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NOT: insertelement <4 x double>
+; UNROLL-NO-IC: middle.block:
+;
+define i32 @PR33613(double* %b, double %j, i32 %d) {
+entry:
+ %idxprom = sext i32 %d to i64
+ br label %for.body
+
+for.cond.cleanup:
+ %a.1.lcssa = phi i32 [ %a.1, %for.body ]
+ ret i32 %a.1.lcssa
+
+for.body:
+ %b.addr.012 = phi double* [ %b, %entry ], [ %add.ptr, %for.body ]
+ %i.011 = phi i32 [ 0, %entry ], [ %inc1, %for.body ]
+ %a.010 = phi i32 [ 0, %entry ], [ %a.1, %for.body ]
+ %j.addr.09 = phi double [ %j, %entry ], [ %0, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %b.addr.012, i64 %idxprom
+ %0 = load double, double* %arrayidx, align 8
+ %mul = fmul double %j.addr.09, %0
+ %tobool = fcmp une double %mul, 0.000000e+00
+ %inc = zext i1 %tobool to i32
+ %a.1 = add nsw i32 %a.010, %inc
+ %inc1 = add nuw nsw i32 %i.011, 1
+ %add.ptr = getelementptr inbounds double, double* %b.addr.012, i64 25
+ %exitcond = icmp eq i32 %inc1, 10240
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; void sink_after(short *a, int n, int *b) {
+; for(int i = 0; i < n; i++)
+; b[i] = (a[i] * a[i + 1]);
+; }
+;
+; SINK-AFTER-LABEL: sink_after
+; Check that the sext sank after the load in the vector loop.
+; SINK-AFTER: vector.body
+; SINK-AFTER: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; SINK-AFTER: %wide.load = load <4 x i16>
+; SINK-AFTER: %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER: %[[VCONV:.+]] = sext <4 x i16> %[[VSHUF]] to <4 x i32>
+; SINK-AFTER: %[[VCONV3:.+]] = sext <4 x i16> %wide.load to <4 x i32>
+; SINK-AFTER: mul nsw <4 x i32> %[[VCONV3]], %[[VCONV]]
+;
+define void @sink_after(i16* %a, i32* %b, i64 %n) {
+entry:
+ %.pre = load i16, i16* %a
+ br label %for.body
+
+for.body:
+ %0 = phi i16 [ %.pre, %entry ], [ %1, %for.body ]
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %conv = sext i16 %0 to i32
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i16, i16* %a, i64 %indvars.iv.next
+ %1 = load i16, i16* %arrayidx2
+ %conv3 = sext i16 %1 to i32
+ %mul = mul nsw i32 %conv3, %conv
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx5
+ %exitcond = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; PR34711: given three consecutive instructions where the first will be
+; widened, the second is a cast that will be widened but needs to sink after
+; the third, and the third is a first-order-recurring load that will be
+; replicated rather than widened. Although the cast and the first instruction
+; will both be widened, and are originally adjacent to each other, make sure
+; the replicated load ends up appearing between them.
+;
+; void PR34711(short (*a)[2], int *b, int *c, int n) {
+; for(int i = 0; i < n; i++) {
+; c[i] = 7;
+; b[i] = (a[i][0] * a[i][1]);
+; }
+; }
+;
+; SINK-AFTER-LABEL: @PR34711
+; Check that the sext sank after the load in the vector loop.
+; SINK-AFTER: vector.body
+; SINK-AFTER: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ {{.*}}, %vector.body ]
+; SINK-AFTER: %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %{{.*}}, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER: %[[VCONV:.+]] = sext <4 x i16> %[[VSHUF]] to <4 x i32>
+; SINK-AFTER: %[[VCONV3:.+]] = sext <4 x i16> {{.*}} to <4 x i32>
+; SINK-AFTER: mul nsw <4 x i32> %[[VCONV3]], %[[VCONV]]
+;
+define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) {
+entry:
+ %pre.index = getelementptr inbounds [2 x i16], [2 x i16]* %a, i64 0, i64 0
+ %.pre = load i16, i16* %pre.index
+ br label %for.body
+
+for.body:
+ %0 = phi i16 [ %.pre, %entry ], [ %1, %for.body ]
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arraycidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+ %cur.index = getelementptr inbounds [2 x i16], [2 x i16]* %a, i64 %indvars.iv, i64 1
+ store i32 7, i32* %arraycidx ; 1st instruction, to be widened.
+ %conv = sext i16 %0 to i32 ; 2nd, cast to sink after third.
+ %1 = load i16, i16* %cur.index ; 3rd, first-order-recurring load not widened.
+ %conv3 = sext i16 %1 to i32
+ %mul = mul nsw i32 %conv3, %conv
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; void no_sink_after(short *a, int n, int *b) {
+; for(int i = 0; i < n; i++)
+; b[i] = ((a[i] + 2) * a[i + 1]);
+; }
+;
+; NO-SINK-AFTER-LABEL: no_sink_after
+; NO-SINK-AFTER-NOT: vector.ph:
+; NO-SINK-AFTER: }
+;
+define void @no_sink_after(i16* %a, i32* %b, i64 %n) {
+entry:
+ %.pre = load i16, i16* %a
+ br label %for.body
+
+for.body:
+ %0 = phi i16 [ %.pre, %entry ], [ %1, %for.body ]
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %conv = sext i16 %0 to i32
+ %add = add nsw i32 %conv, 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i16, i16* %a, i64 %indvars.iv.next
+ %1 = load i16, i16* %arrayidx2
+ %conv3 = sext i16 %1 to i32
+ %mul = mul nsw i32 %add, %conv3
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx5
+ %exitcond = icmp eq i64 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/flags.ll b/llvm/test/Transforms/LoopVectorize/flags.ll
new file mode 100644
index 00000000000..f1b122d6678
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/flags.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
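+; Both loops below are, roughly: for (int i = 9; i < n; i++) A[i] *= 3;
+; (a sketch; the entry guard requires n > 9). In @flags1 the scalar 'mul'
+; carries nsw and the vectorized 'mul' is expected to keep it; @flags2 is the
+; same loop with a plain 'mul'.
+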
+;CHECK-LABEL: @flags1(
+;CHECK: load <4 x i32>
+;CHECK: mul nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags1(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 9
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = mul nsw i32 %3, 3
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+
+;CHECK-LABEL: @flags2(
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 9
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = mul i32 %3, 3
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+; Make sure we copy fast math flags and use them for the final reduction.
+; CHECK-LABEL: fast_math
+; CHECK: load <4 x float>
+; CHECK: fadd fast <4 x float>
+; CHECK: br
+; CHECK: fadd fast <4 x float>
+; CHECK: fadd fast <4 x float>
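+;
+; Roughly, in C (a sketch; the source would be compiled with fast-math):
+;
+; float fast_math(float *s) {
+;   float q = 0.0f;
+;   for (int i = 0; i < 256; i++)
+;     q += s[i];
+;   return q;
+; }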
+define float @fast_math(float* noalias %s) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %q.04 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %s, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %add = fadd fast float %q.04, %0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ %add.lcssa = phi float [ %add, %for.body ]
+ ret float %add.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll
new file mode 100644
index 00000000000..7d22ba18a28
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll
@@ -0,0 +1,340 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -dce -simplifycfg -instcombine -simplifycfg -keep-loops=false -S | FileCheck --check-prefix VEC2_INTERL1_PRED_STORE %s
+
+@fp_inc = common global float 0.000000e+00, align 4
+
+;void fp_iv_loop1(float init, float * __restrict__ A, int N) {
+; float x = init;
+; for (int i=0; i < N; ++i) {
+; A[i] = x;
+; x -= fp_inc;
+; }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop1(
+; VEC4_INTERL1: vector.ph:
+; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
+; VEC4_INTERL1-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT: [[INDUCTION4:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]]
+; VEC4_INTERL1-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00
+; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0
+; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT: br label %vector.body
+; VEC4_INTERL1: vector.body:
+; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION4]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
+; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP9]], align 4
+; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT6]]
+; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body
+
+; VEC4_INTERL2-LABEL: @fp_iv_loop1(
+; VEC4_INTERL2: vector.ph:
+; VEC4_INTERL2: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
+; VEC4_INTERL2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT4]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL2-NEXT: [[INDUCTION5:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]]
+; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00
+; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0
+; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT: br label %vector.body
+; VEC4_INTERL2: vector.body:
+; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]]
+; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
+; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4
+; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 4
+; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
+; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4
+; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT7]]
+; VEC4_INTERL2: br i1 {{.*}}, label %middle.block, label %vector.body
+
+; VEC1_INTERL2-LABEL: @fp_iv_loop1(
+; VEC1_INTERL2: vector.ph:
+; VEC1_INTERL2: br label %vector.body
+; VEC1_INTERL2: vector.body:
+; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1
+; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = sitofp i64 [[INDEX]] to float
+; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fmul fast float %fpinc, [[TMP6]]
+; VEC1_INTERL2-NEXT: [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP7]]
+; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = fsub fast float [[FP_OFFSET_IDX]], %fpinc
+; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC1_INTERL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDUCTION2]]
+; VEC1_INTERL2-NEXT: store float [[FP_OFFSET_IDX]], float* [[TMP9]], align 4
+; VEC1_INTERL2-NEXT: store float [[TMP8]], float* [[TMP10]], align 4
+; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; VEC1_INTERL2: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #1 {
+entry:
+ %cmp4 = icmp sgt i32 %N, 0
+ br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %fpinc = load float, float* @fp_inc, align 4
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %x.05 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %x.05, float* %arrayidx, align 4
+ %add = fsub fast float %x.05, %fpinc
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+;void fp_iv_loop2(float init, float * __restrict__ A, int N) {
+; float x = init;
+; for (int i=0; i < N; ++i) {
+; A[i] = x;
+; x += 0.5;
+; }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop2(
+; VEC4_INTERL1: vector.ph:
+; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT: [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
+; VEC4_INTERL1-NEXT: br label %vector.body
+; VEC4_INTERL1: vector.body:
+; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION2]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
+; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4
+; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 {
+entry:
+ %cmp4 = icmp sgt i32 %N, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %x.06 = phi float [ %conv1, %for.body ], [ %init, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %x.06, float* %arrayidx, align 4
+ %conv1 = fadd fast float %x.06, 5.000000e-01
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+;void fp_iv_loop3(float init, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, int N) {
+; int i = 0;
+; float x = init;
+; float y = 0.1;
+; for (; i < N; ++i) {
+; A[i] = x;
+; x += fp_inc;
+; y -= 0.5;
+; B[i] = x + y;
+; C[i] = y;
+; }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop3(
+; VEC4_INTERL1: for.body.lr.ph:
+; VEC4_INTERL1: [[TMP0:%.*]] = load float, float* @fp_inc, align 4
+; VEC4_INTERL1: vector.ph:
+; VEC4_INTERL1: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
+; VEC4_INTERL1-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT: [[INDUCTION7:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP7]]
+; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP0]], 4.000000e+00
+; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i32 0
+; VEC4_INTERL1-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
+; VEC4_INTERL1-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT12]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]]
+; VEC4_INTERL1: vector.body:
+; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ <float 0x3FB99999A0000000, float 0xBFD99999A0000000, float 0xBFECCCCCC0000000, float 0xBFF6666660000000>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
+; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND10]], <4 x float>* [[TMP13]], align 4
+; VEC4_INTERL1-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT13]]
+; VEC4_INTERL1-NEXT: [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
+; VEC4_INTERL1-NEXT: [[TMP16:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]]
+; VEC4_INTERL1-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* %B, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>*
+; VEC4_INTERL1-NEXT: store <4 x float> [[TMP16]], <4 x float>* [[TMP18]], align 4
+; VEC4_INTERL1-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* %C, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
+; VEC4_INTERL1-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4
+; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00>
+; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT11]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]]
+; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 {
+entry:
+ %cmp9 = icmp sgt i32 %N, 0
+ br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load float, float* @fp_inc, align 4
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %y.012 = phi float [ 0x3FB99999A0000000, %for.body.lr.ph ], [ %conv1, %for.body ]
+ %x.011 = phi float [ %init, %for.body.lr.ph ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %x.011, float* %arrayidx, align 4
+ %add = fadd fast float %x.011, %0
+ %conv1 = fadd fast float %y.012, -5.000000e-01
+ %add2 = fadd fast float %conv1, %add
+ %arrayidx4 = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ store float %add2, float* %arrayidx4, align 4
+ %arrayidx6 = getelementptr inbounds float, float* %C, i64 %indvars.iv
+ store float %conv1, float* %arrayidx6, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Start and step values are constants, so there is no 'fmul' operation in this case.
+;void fp_iv_loop4(float * __restrict__ A, int N) {
+; float x = 1.0;
+; for (int i=0; i < N; ++i) {
+; A[i] = x;
+; x += 0.5;
+; }
+;}
+
+; VEC4_INTERL1-LABEL: @fp_iv_loop4(
+; VEC4_INTERL1: vector.ph:
+; VEC4_INTERL1: br label %vector.body
+; VEC4_INTERL1: vector.body:
+; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
+; VEC4_INTERL1-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4
+; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; VEC4_INTERL1: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) {
+entry:
+ %cmp4 = icmp sgt i32 %N, 0
+ br i1 %cmp4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %x.06, float* %arrayidx, align 4
+ %conv1 = fadd fast float %x.06, 5.000000e-01
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar(
+; VEC2_INTERL1_PRED_STORE: vector.body:
+; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>*
+; VEC2_INTERL1_PRED_STORE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP4:%.*]] = fcmp fast oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF]]:
+; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP1]], float* [[TMP2]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE]]:
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
+; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_IF6]]:
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 1
+; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP10]]
+; VEC2_INTERL1_PRED_STORE-NEXT: store float [[TMP9]], float* [[TMP11]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT: br label %[[PRED_STORE_CONTINUE7]]
+; VEC2_INTERL1_PRED_STORE: [[PRED_STORE_CONTINUE7]]:
+; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; VEC2_INTERL1_PRED_STORE: br i1 {{.*}}, label %middle.block, label %vector.body
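+;
+; Roughly, in C (a sketch, assuming N >= 1):
+;   float j = 0.0f;
+;   for (long i = 0; i < N; i++, j += 1.0f)
+;     if (A[i] == 0.0f)
+;       A[i] = j;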
+
+define void @non_primary_iv_float_scalar(float* %A, i64 %N) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.inc ], [ 0, %entry ]
+ %j = phi float [ %j.next, %for.inc ], [ 0.0, %entry ]
+ %tmp0 = getelementptr inbounds float, float* %A, i64 %i
+ %tmp1 = load float, float* %tmp0, align 4
+ %tmp2 = fcmp fast oeq float %tmp1, 0.0
+ br i1 %tmp2, label %if.pred, label %for.inc
+
+if.pred:
+ store float %j, float* %tmp0, align 4
+ br label %for.inc
+
+for.inc:
+ %i.next = add nuw nsw i64 %i, 1
+ %j.next = fadd fast float %j, 1.0
+ %cond = icmp slt i64 %i.next, %N
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/float-reduction.ll b/llvm/test/Transforms/LoopVectorize/float-reduction.ll
new file mode 100644
index 00000000000..f3b95d0ead7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/float-reduction.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+;CHECK-LABEL: @foo(
+;CHECK: fadd fast <4 x float>
+;CHECK: ret
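+;
+; Roughly, in C (with fast-math):
+;   float sum = 0.0f;
+;   for (int i = 0; i < 200; i++)
+;     sum += A[i];
+;   return sum;
+; @foosub below is the same loop with 'sum -= A[i]'.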
+define float @foo(float* nocapture %A, i32* nocapture %n) nounwind uwtable readonly ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %sum.04 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %add = fadd fast float %sum.04, %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 200
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret float %add
+}
+
+;CHECK-LABEL: @foosub(
+;CHECK: fsub fast <4 x float>
+;CHECK: ret
+define float @foosub(float* nocapture %A, i32* nocapture %n) nounwind uwtable readonly ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %sum.04 = phi float [ 0.000000e+00, %entry ], [ %sub, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %sub = fsub fast float %sum.04, %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 200
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret float %sub
+}
diff --git a/llvm/test/Transforms/LoopVectorize/followup.ll b/llvm/test/Transforms/LoopVectorize/followup.ll
new file mode 100644
index 00000000000..a075061876f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/followup.ll
@@ -0,0 +1,43 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S < %s | FileCheck %s
+;
+; Check that the followup loop attributes are applied.
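+; The !llvm.loop metadata on the scalar latch names three followup attribute
+; lists; after vectorization, the vector loop should carry followup_vectorized
+; plus followup_all, and the scalar remainder loop followup_epilogue plus
+; followup_all, as the CHECK lines at the bottom verify.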
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define void @followup(i32* nocapture %a, i32 %n) {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = trunc i64 %indvars.iv to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret void
+}
+
+!0 = distinct !{!0, !3, !4, !5}
+!3 = !{!"llvm.loop.vectorize.followup_vectorized", !{!"FollowupVectorized"}}
+!4 = !{!"llvm.loop.vectorize.followup_epilogue", !{!"FollowupEpilogue"}}
+!5 = !{!"llvm.loop.vectorize.followup_all", !{!"FollowupAll"}}
+
+
+; CHECK-LABEL: @followup(
+
+; CHECK-LABEL: vector.body:
+; CHECK: br i1 %13, label %middle.block, label %vector.body, !llvm.loop ![[LOOP_VECTOR:[0-9]+]]
+; CHECK-LABEL: for.body:
+; CHECK: br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop ![[LOOP_EPILOGUE:[0-9]+]]
+
+; CHECK: ![[LOOP_VECTOR]] = distinct !{![[LOOP_VECTOR]], ![[FOLLOWUP_ALL:[0-9]+]], ![[FOLLOWUP_VECTORIZED:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_ALL]] = !{!"FollowupAll"}
+; CHECK: ![[FOLLOWUP_VECTORIZED:[0-9]+]] = !{!"FollowupVectorized"}
+; CHECK: ![[LOOP_EPILOGUE]] = distinct !{![[LOOP_EPILOGUE]], ![[FOLLOWUP_ALL]], ![[FOLLOWUP_EPILOGUE:[0-9]+]]}
+; CHECK: ![[FOLLOWUP_EPILOGUE]] = !{!"FollowupEpilogue"}
diff --git a/llvm/test/Transforms/LoopVectorize/funcall.ll b/llvm/test/Transforms/LoopVectorize/funcall.ll
new file mode 100644
index 00000000000..35c2dfca4b2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/funcall.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can vectorize loops with calls to math library functions.
+; Such calls might read the rounding mode, but we only vectorize loops that
+; contain a limited set of function calls, none of which sets the rounding
+; mode, so vectorizing them is safe.
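+;
+; Roughly, in C:
+;
+; void test(double *d, double t) {
+;   for (int i = 0; i < 128; i++)
+;     d[i] = pow(d[i], t);
+; }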
+
+; CHECK-LABEL: @test(
+; CHECK: <2 x double>
+
+define void @test(double* %d, double %t) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %d, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %1 = tail call double @llvm.pow.f64(double %0, double %t)
+ store double %1, double* %arrayidx, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+declare double @llvm.pow.f64(double, double)
diff --git a/llvm/test/Transforms/LoopVectorize/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
new file mode 100644
index 00000000000..96a266dd5ba
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -0,0 +1,685 @@
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+@G = common global [32 x [1024 x i32]] zeroinitializer, align 16
+@ub = common global [1024 x i32] zeroinitializer, align 16
+@uc = common global [1024 x i32] zeroinitializer, align 16
+@d = common global [2048 x i32] zeroinitializer, align 16
+@fa = common global [1024 x float] zeroinitializer, align 16
+@fb = common global [1024 x float] zeroinitializer, align 16
+@ic = common global [1024 x i32] zeroinitializer, align 16
+@da = common global [1024 x float] zeroinitializer, align 16
+@db = common global [1024 x float] zeroinitializer, align 16
+@dc = common global [1024 x float] zeroinitializer, align 16
+@dd = common global [1024 x float] zeroinitializer, align 16
+@dj = common global [1024 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example1(
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: add nsw <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
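+; Roughly: for (int i = 0; i < 256; i++) a[i] = b[i] + c[i]; (a, b, c are the
+; globals above).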
+define void @example1() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example2(
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example2(
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
+define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph5, label %.preheader
+
+..preheader_crit_edge: ; preds = %.lr.ph5
+ %phitmp = sext i32 %n to i64
+ br label %.preheader
+
+.preheader: ; preds = %..preheader_crit_edge, %0
+ %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ]
+ %2 = icmp eq i32 %n, 0
+ br i1 %2, label %._crit_edge, label %.lr.ph
+
+.lr.ph5: ; preds = %0, %.lr.ph5
+ %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
+ %3 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv6
+ store i32 %x, i32* %3, align 4
+ %indvars.iv.next7 = add i64 %indvars.iv6, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5
+
+.lr.ph: ; preds = %.preheader, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]
+ %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]
+ %4 = add nsw i32 %.02, -1
+ %5 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %6 = load i32, i32* %5, align 4
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %8 = load i32, i32* %7, align 4
+ %9 = and i32 %8, %6
+ %10 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %9, i32* %10, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %11 = icmp eq i32 %4, 0
+ br i1 %11, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %.preheader
+ ret void
+}
+
+;CHECK-LABEL: @example3(
+;CHECK: <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example3(
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: <4 x i32>
+;UNROLL: ret void
+define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
+ %1 = icmp eq i32 %n, 0
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
+ %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
+ %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
+ %2 = add nsw i32 %.05, -1
+ %3 = getelementptr inbounds i32, i32* %.023, i64 1
+ %4 = load i32, i32* %.023, align 16
+ %5 = getelementptr inbounds i32, i32* %.014, i64 1
+ store i32 %4, i32* %.014, align 16
+ %6 = icmp eq i32 %2, 0
+ br i1 %6, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+;CHECK-LABEL: @example4(
+;CHECK: load <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example4(
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: load <4 x i32>
+;UNROLL: ret void
+define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
+ %1 = add nsw i32 %n, -1
+ %2 = icmp eq i32 %n, 0
+ br i1 %2, label %.preheader4, label %.lr.ph10
+
+.preheader4: ; preds = %0
+ %3 = icmp sgt i32 %1, 0
+ br i1 %3, label %.lr.ph6, label %._crit_edge
+
+.lr.ph10: ; preds = %0, %.lr.ph10
+ %4 = phi i32 [ %9, %.lr.ph10 ], [ %1, %0 ]
+ %.018 = phi i32* [ %8, %.lr.ph10 ], [ %p, %0 ]
+ %.027 = phi i32* [ %5, %.lr.ph10 ], [ %q, %0 ]
+ %5 = getelementptr inbounds i32, i32* %.027, i64 1
+ %6 = load i32, i32* %.027, align 16
+ %7 = add nsw i32 %6, 5
+ %8 = getelementptr inbounds i32, i32* %.018, i64 1
+ store i32 %7, i32* %.018, align 16
+ %9 = add nsw i32 %4, -1
+ %10 = icmp eq i32 %4, 0
+ br i1 %10, label %._crit_edge, label %.lr.ph10
+
+.preheader: ; preds = %.lr.ph6
+ br i1 %3, label %.lr.ph, label %._crit_edge
+
+.lr.ph6: ; preds = %.preheader4, %.lr.ph6
+ %indvars.iv11 = phi i64 [ %indvars.iv.next12, %.lr.ph6 ], [ 0, %.preheader4 ]
+ %indvars.iv.next12 = add i64 %indvars.iv11, 1
+ %11 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv.next12
+ %12 = load i32, i32* %11, align 4
+ %13 = add nsw i64 %indvars.iv11, 3
+ %14 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %13
+ %15 = load i32, i32* %14, align 4
+ %16 = add nsw i32 %15, %12
+ %17 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv11
+ store i32 %16, i32* %17, align 4
+ %lftr.wideiv13 = trunc i64 %indvars.iv.next12 to i32
+ %exitcond14 = icmp eq i32 %lftr.wideiv13, %1
+ br i1 %exitcond14, label %.preheader, label %.lr.ph6
+
+.lr.ph: ; preds = %.preheader, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.preheader ]
+ %18 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %19 = load i32, i32* %18, align 4
+ %20 = icmp sgt i32 %19, 4
+ %21 = select i1 %20, i32 4, i32 0
+ %22 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ store i32 %21, i32* %22, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %1
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph10, %.preheader4, %.lr.ph, %.preheader
+ ret void
+}
+
+;CHECK-LABEL: @example8(
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example8(
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
+define void @example8(i32 %x) nounwind uwtable ssp {
+ br label %.preheader
+
+.preheader: ; preds = %3, %0
+ %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %3 ]
+ br label %1
+
+; <label>:1 ; preds = %1, %.preheader
+ %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [32 x [1024 x i32]], [32 x [1024 x i32]]* @G, i64 0, i64 %indvars.iv3, i64 %indvars.iv
+ store i32 %x, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %3, label %1
+
+; <label>:3 ; preds = %1
+ %indvars.iv.next4 = add i64 %indvars.iv3, 1
+ %lftr.wideiv5 = trunc i64 %indvars.iv.next4 to i32
+ %exitcond6 = icmp eq i32 %lftr.wideiv5, 32
+ br i1 %exitcond6, label %4, label %.preheader
+
+; <label>:4 ; preds = %3
+ ret void
+}
+
+;CHECK-LABEL: @example9(
+;CHECK: phi <4 x i32>
+;CHECK: ret i32
+define i32 @example9() nounwind uwtable readonly ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %diff.01 = phi i32 [ 0, %0 ], [ %7, %1 ]
+ %2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @ub, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [1024 x i32], [1024 x i32]* @uc, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add i32 %3, %diff.01
+ %7 = sub i32 %6, %5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret i32 %7
+}
+
+;CHECK-LABEL: @example10a(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: load <4 x i16>
+;CHECK: add <4 x i16>
+;CHECK: store <4 x i16>
+;CHECK: ret void
+define void @example10a(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i32, i32* %ib, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds i32, i32* %ic, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %8 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+ %9 = load i16, i16* %8, align 2
+ %10 = getelementptr inbounds i16, i16* %sc, i64 %indvars.iv
+ %11 = load i16, i16* %10, align 2
+ %12 = add i16 %11, %9
+ %13 = getelementptr inbounds i16, i16* %sa, i64 %indvars.iv
+ store i16 %12, i16* %13, align 2
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %14, label %1
+
+; <label>:14 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example10b(
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
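+; Roughly: for (int i = 0; i < 1024; i++) ia[i] = (int)sb[i];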
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+ %3 = load i16, i16* %2, align 2
+ %4 = sext i16 %3 to i32
+ %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+ store i32 %4, i32* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example11(
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: ret void
+define void @example11() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = shl nsw i64 %indvars.iv, 1
+ %3 = or i64 %2, 1
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %3
+ %5 = load i32, i32* %4, align 4
+ %6 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %3
+ %7 = load i32, i32* %6, align 4
+ %8 = mul nsw i32 %7, %5
+ %9 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %2
+ %10 = load i32, i32* %9, align 8
+ %11 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %2
+ %12 = load i32, i32* %11, align 8
+ %13 = mul nsw i32 %12, %10
+ %14 = sub nsw i32 %8, %13
+ %15 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %14, i32* %15, align 4
+ %16 = mul nsw i32 %7, %10
+ %17 = mul nsw i32 %12, %5
+ %18 = add nsw i32 %17, %16
+ %19 = getelementptr inbounds [2048 x i32], [2048 x i32]* @d, i64 0, i64 %indvars.iv
+ store i32 %18, i32* %19, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 512
+ br i1 %exitcond, label %20, label %1
+
+; <label>:20 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example12(
+;CHECK: %vec.ind1 = phi <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
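+; Roughly: for (int i = 0; i < 1024; i++) a[i] = i;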
+define void @example12() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = trunc i64 %indvars.iv to i32
+ store i32 %3, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %4, label %1
+
+; <label>:4 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example13(
+;CHECK: <4 x i32>
+;CHECK: ret void
+define void @example13(i32** nocapture %A, i32** nocapture %B, i32* nocapture %out) nounwind uwtable ssp {
+ br label %.preheader
+
+.preheader: ; preds = %14, %0
+ %indvars.iv4 = phi i64 [ 0, %0 ], [ %indvars.iv.next5, %14 ]
+ %1 = getelementptr inbounds i32*, i32** %A, i64 %indvars.iv4
+ %2 = load i32*, i32** %1, align 8
+ %3 = getelementptr inbounds i32*, i32** %B, i64 %indvars.iv4
+ %4 = load i32*, i32** %3, align 8
+ br label %5
+
+; <label>:5 ; preds = %.preheader, %5
+ %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %5 ]
+ %diff.02 = phi i32 [ 0, %.preheader ], [ %11, %5 ]
+ %6 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv
+ %7 = load i32, i32* %6, align 4
+ %8 = getelementptr inbounds i32, i32* %4, i64 %indvars.iv
+ %9 = load i32, i32* %8, align 4
+ %10 = add i32 %7, %diff.02
+ %11 = sub i32 %10, %9
+ %indvars.iv.next = add i64 %indvars.iv, 8
+ %12 = trunc i64 %indvars.iv.next to i32
+ %13 = icmp slt i32 %12, 1024
+ br i1 %13, label %5, label %14
+
+; <label>:14 ; preds = %5
+ %15 = getelementptr inbounds i32, i32* %out, i64 %indvars.iv4
+ store i32 %11, i32* %15, align 4
+ %indvars.iv.next5 = add i64 %indvars.iv4, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next5 to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 32
+ br i1 %exitcond, label %16, label %.preheader
+
+; <label>:16 ; preds = %14
+ ret void
+}
+
+; Can vectorize.
+;CHECK-LABEL: @example14(
+;CHECK: <4 x i32>
+;CHECK: ret void
+define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocapture %out) nounwind uwtable ssp {
+.preheader3:
+ br label %.preheader
+
+.preheader: ; preds = %11, %.preheader3
+ %indvars.iv7 = phi i64 [ 0, %.preheader3 ], [ %indvars.iv.next8, %11 ]
+ %sum.05 = phi i32 [ 0, %.preheader3 ], [ %10, %11 ]
+ br label %0
+
+; <label>:0 ; preds = %0, %.preheader
+ %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %0 ]
+ %sum.12 = phi i32 [ %sum.05, %.preheader ], [ %10, %0 ]
+ %1 = getelementptr inbounds i32*, i32** %in, i64 %indvars.iv
+ %2 = load i32*, i32** %1, align 8
+ %3 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv7
+ %4 = load i32, i32* %3, align 4
+ %5 = getelementptr inbounds i32*, i32** %coeff, i64 %indvars.iv
+ %6 = load i32*, i32** %5, align 8
+ %7 = getelementptr inbounds i32, i32* %6, i64 %indvars.iv7
+ %8 = load i32, i32* %7, align 4
+ %9 = mul nsw i32 %8, %4
+ %10 = add nsw i32 %9, %sum.12
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %11, label %0
+
+; <label>:11 ; preds = %0
+ %indvars.iv.next8 = add i64 %indvars.iv7, 1
+ %lftr.wideiv9 = trunc i64 %indvars.iv.next8 to i32
+ %exitcond10 = icmp eq i32 %lftr.wideiv9, 32
+ br i1 %exitcond10, label %.preheader3.1, label %.preheader
+
+.preheader3.1: ; preds = %11
+ store i32 %10, i32* %out, align 4
+ br label %.preheader.1
+
+.preheader.1: ; preds = %24, %.preheader3.1
+ %indvars.iv7.1 = phi i64 [ 0, %.preheader3.1 ], [ %indvars.iv.next8.1, %24 ]
+ %sum.05.1 = phi i32 [ 0, %.preheader3.1 ], [ %23, %24 ]
+ br label %12
+
+; <label>:12 ; preds = %12, %.preheader.1
+ %indvars.iv.1 = phi i64 [ 0, %.preheader.1 ], [ %13, %12 ]
+ %sum.12.1 = phi i32 [ %sum.05.1, %.preheader.1 ], [ %23, %12 ]
+ %13 = add nsw i64 %indvars.iv.1, 1
+ %14 = getelementptr inbounds i32*, i32** %in, i64 %13
+ %15 = load i32*, i32** %14, align 8
+ %16 = getelementptr inbounds i32, i32* %15, i64 %indvars.iv7.1
+ %17 = load i32, i32* %16, align 4
+ %18 = getelementptr inbounds i32*, i32** %coeff, i64 %indvars.iv.1
+ %19 = load i32*, i32** %18, align 8
+ %20 = getelementptr inbounds i32, i32* %19, i64 %indvars.iv7.1
+ %21 = load i32, i32* %20, align 4
+ %22 = mul nsw i32 %21, %17
+ %23 = add nsw i32 %22, %sum.12.1
+ %lftr.wideiv.1 = trunc i64 %13 to i32
+ %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 1024
+ br i1 %exitcond.1, label %24, label %12
+
+; <label>:24 ; preds = %12
+ %indvars.iv.next8.1 = add i64 %indvars.iv7.1, 1
+ %lftr.wideiv9.1 = trunc i64 %indvars.iv.next8.1 to i32
+ %exitcond10.1 = icmp eq i32 %lftr.wideiv9.1, 32
+ br i1 %exitcond10.1, label %.preheader3.2, label %.preheader.1
+
+.preheader3.2: ; preds = %24
+ %25 = getelementptr inbounds i32, i32* %out, i64 1
+ store i32 %23, i32* %25, align 4
+ br label %.preheader.2
+
+.preheader.2: ; preds = %38, %.preheader3.2
+ %indvars.iv7.2 = phi i64 [ 0, %.preheader3.2 ], [ %indvars.iv.next8.2, %38 ]
+ %sum.05.2 = phi i32 [ 0, %.preheader3.2 ], [ %37, %38 ]
+ br label %26
+
+; <label>:26 ; preds = %26, %.preheader.2
+ %indvars.iv.2 = phi i64 [ 0, %.preheader.2 ], [ %indvars.iv.next.2, %26 ]
+ %sum.12.2 = phi i32 [ %sum.05.2, %.preheader.2 ], [ %37, %26 ]
+ %27 = add nsw i64 %indvars.iv.2, 2
+ %28 = getelementptr inbounds i32*, i32** %in, i64 %27
+ %29 = load i32*, i32** %28, align 8
+ %30 = getelementptr inbounds i32, i32* %29, i64 %indvars.iv7.2
+ %31 = load i32, i32* %30, align 4
+ %32 = getelementptr inbounds i32*, i32** %coeff, i64 %indvars.iv.2
+ %33 = load i32*, i32** %32, align 8
+ %34 = getelementptr inbounds i32, i32* %33, i64 %indvars.iv7.2
+ %35 = load i32, i32* %34, align 4
+ %36 = mul nsw i32 %35, %31
+ %37 = add nsw i32 %36, %sum.12.2
+ %indvars.iv.next.2 = add i64 %indvars.iv.2, 1
+ %lftr.wideiv.2 = trunc i64 %indvars.iv.next.2 to i32
+ %exitcond.2 = icmp eq i32 %lftr.wideiv.2, 1024
+ br i1 %exitcond.2, label %38, label %26
+
+; <label>:38 ; preds = %26
+ %indvars.iv.next8.2 = add i64 %indvars.iv7.2, 1
+ %lftr.wideiv9.2 = trunc i64 %indvars.iv.next8.2 to i32
+ %exitcond10.2 = icmp eq i32 %lftr.wideiv9.2, 32
+ br i1 %exitcond10.2, label %.preheader3.3, label %.preheader.2
+
+.preheader3.3: ; preds = %38
+ %39 = getelementptr inbounds i32, i32* %out, i64 2
+ store i32 %37, i32* %39, align 4
+ br label %.preheader.3
+
+.preheader.3: ; preds = %52, %.preheader3.3
+ %indvars.iv7.3 = phi i64 [ 0, %.preheader3.3 ], [ %indvars.iv.next8.3, %52 ]
+ %sum.05.3 = phi i32 [ 0, %.preheader3.3 ], [ %51, %52 ]
+ br label %40
+
+; <label>:40 ; preds = %40, %.preheader.3
+ %indvars.iv.3 = phi i64 [ 0, %.preheader.3 ], [ %indvars.iv.next.3, %40 ]
+ %sum.12.3 = phi i32 [ %sum.05.3, %.preheader.3 ], [ %51, %40 ]
+ %41 = add nsw i64 %indvars.iv.3, 3
+ %42 = getelementptr inbounds i32*, i32** %in, i64 %41
+ %43 = load i32*, i32** %42, align 8
+ %44 = getelementptr inbounds i32, i32* %43, i64 %indvars.iv7.3
+ %45 = load i32, i32* %44, align 4
+ %46 = getelementptr inbounds i32*, i32** %coeff, i64 %indvars.iv.3
+ %47 = load i32*, i32** %46, align 8
+ %48 = getelementptr inbounds i32, i32* %47, i64 %indvars.iv7.3
+ %49 = load i32, i32* %48, align 4
+ %50 = mul nsw i32 %49, %45
+ %51 = add nsw i32 %50, %sum.12.3
+ %indvars.iv.next.3 = add i64 %indvars.iv.3, 1
+ %lftr.wideiv.3 = trunc i64 %indvars.iv.next.3 to i32
+ %exitcond.3 = icmp eq i32 %lftr.wideiv.3, 1024
+ br i1 %exitcond.3, label %52, label %40
+
+; <label>:52 ; preds = %40
+ %indvars.iv.next8.3 = add i64 %indvars.iv7.3, 1
+ %lftr.wideiv9.3 = trunc i64 %indvars.iv.next8.3 to i32
+ %exitcond10.3 = icmp eq i32 %lftr.wideiv9.3, 32
+ br i1 %exitcond10.3, label %53, label %.preheader.3
+
+; <label>:53 ; preds = %52
+ %54 = getelementptr inbounds i32, i32* %out, i64 3
+ store i32 %51, i32* %54, align 4
+ ret void
+}
+
+;CHECK-LABEL: @example21(
+;CHECK: load <4 x i32>
+;CHECK: shufflevector {{.*}} <i32 3, i32 2, i32 1, i32 0>
+;CHECK: ret i32
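+; Roughly (the reversed shuffle comes from the backward iteration):
+;   int a = 0;
+;   for (int i = n - 1; i >= 0; i--)
+;     a += b[i];
+;   return a;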
+define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = sext i32 %n to i64
+ br label %3
+
+; <label>:3 ; preds = %.lr.ph, %3
+ %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ]
+ %a.02 = phi i32 [ 0, %.lr.ph ], [ %6, %3 ]
+ %indvars.iv.next = add i64 %indvars.iv, -1
+ %4 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %a.02
+ %7 = trunc i64 %indvars.iv.next to i32
+ %8 = icmp sgt i32 %7, 0
+ br i1 %8, label %3, label %._crit_edge
+
+._crit_edge: ; preds = %3, %0
+ %a.0.lcssa = phi i32 [ 0, %0 ], [ %6, %3 ]
+ ret i32 %a.0.lcssa
+}
+
+;CHECK-LABEL: @example23(
+;CHECK: <4 x i32>
+;CHECK: ret void
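+; Roughly: for (int i = 0; i < 256; i++) *dst++ = *src++ << 7;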
+define void @example23(i16* nocapture %src, i32* nocapture %dst) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+ %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+ %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
+ %2 = getelementptr inbounds i16, i16* %.04, i64 1
+ %3 = load i16, i16* %.04, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw nsw i32 %4, 7
+ %6 = getelementptr inbounds i32, i32* %.013, i64 1
+ store i32 %5, i32* %.013, align 4
+ %7 = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %7, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example24(
+;CHECK: shufflevector <4 x i16>
+;CHECK: ret void
+define void @example24(i16 signext %x, i16 signext %y) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [1024 x float], [1024 x float]* @fa, i64 0, i64 %indvars.iv
+ %3 = load float, float* %2, align 4
+ %4 = getelementptr inbounds [1024 x float], [1024 x float]* @fb, i64 0, i64 %indvars.iv
+ %5 = load float, float* %4, align 4
+ %6 = fcmp olt float %3, %5
+ %x.y = select i1 %6, i16 %x, i16 %y
+ %7 = sext i16 %x.y to i32
+ %8 = getelementptr inbounds [1024 x i32], [1024 x i32]* @ic, i64 0, i64 %indvars.iv
+ store i32 %7, i32* %8, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %9, label %1
+
+; <label>:9 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @example25(
+;CHECK: and <4 x i1>
+;CHECK: zext <4 x i1>
+;CHECK: ret void
+define void @example25() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [1024 x float], [1024 x float]* @da, i64 0, i64 %indvars.iv
+ %3 = load float, float* %2, align 4
+ %4 = getelementptr inbounds [1024 x float], [1024 x float]* @db, i64 0, i64 %indvars.iv
+ %5 = load float, float* %4, align 4
+ %6 = fcmp olt float %3, %5
+ %7 = getelementptr inbounds [1024 x float], [1024 x float]* @dc, i64 0, i64 %indvars.iv
+ %8 = load float, float* %7, align 4
+ %9 = getelementptr inbounds [1024 x float], [1024 x float]* @dd, i64 0, i64 %indvars.iv
+ %10 = load float, float* %9, align 4
+ %11 = fcmp olt float %8, %10
+ %12 = and i1 %6, %11
+ %13 = zext i1 %12 to i32
+ %14 = getelementptr inbounds [1024 x i32], [1024 x i32]* @dj, i64 0, i64 %indvars.iv
+ store i32 %13, i32* %14, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %15, label %1
+
+; <label>:15 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
new file mode 100644
index 00000000000..e73b6eacbe1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/gep_with_bitcast.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Vectorization of loop with bitcast between GEP and load
+; Simplified source code:
+;void foo (double** __restrict__ in, bool * __restrict__ res) {
+;
+; for (int i = 0; i < 4096; ++i)
+; res[i] = ((unsigned long long)in[i] == 0);
+;}
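+;
+; Note (an illustrative reading of the IR below, not from the original
+; source): the cast (unsigned long long)in[i] appears in IR as a bitcast of
+; the double** GEP to i64* followed by an i64 load, so the vectorizer has to
+; treat the bitcast-ed pointers as consecutive accesses in order to form the
+; <4 x i64> wide load checked for below.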
+
+; CHECK-LABEL: @foo
+; CHECK: vector.body
+; CHECK: %[[IV:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[v0:.+]] = getelementptr inbounds double*, double** %in, i64 %[[IV]]
+; CHECK: %[[v1:.+]] = bitcast double** %[[v0]] to <4 x i64>*
+; CHECK: %wide.load = load <4 x i64>, <4 x i64>* %[[v1]], align 8
+; CHECK: icmp eq <4 x i64> %wide.load, zeroinitializer
+; CHECK: br i1
+
+define void @foo(double** noalias nocapture readonly %in, double** noalias nocapture readnone %out, i8* noalias nocapture %res) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double*, double** %in, i64 %indvars.iv
+ %tmp53 = bitcast double** %arrayidx to i64*
+ %tmp54 = load i64, i64* %tmp53, align 8
+ %cmp1 = icmp eq i64 %tmp54, 0
+ %arrayidx3 = getelementptr inbounds i8, i8* %res, i64 %indvars.iv
+ %frombool = zext i1 %cmp1 to i8
+ store i8 %frombool, i8* %arrayidx3, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 4096
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/global_alias.ll b/llvm/test/Transforms/LoopVectorize/global_alias.ll
new file mode 100644
index 00000000000..7333af3b925
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/global_alias.ll
@@ -0,0 +1,1077 @@
+; RUN: opt < %s -O1 -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+
+%struct.anon = type { [100 x i32], i32, [100 x i32] }
+%struct.anon.0 = type { [100 x [100 x i32]], i32, [100 x [100 x i32]] }
+
+@Foo = common global %struct.anon zeroinitializer, align 4
+@Bar = common global %struct.anon.0 zeroinitializer, align 4
+
+@PB = external global i32*
+@PA = external global i32*
+
+
+;; === First, the tests that should always vectorize, whether statically or by adding run-time checks ===
+
+
+; /// Different objects, positive induction, constant distance
+; int noAlias01 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[i] = Foo.B[i] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias01(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias01(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+ store i32 %add, i32* %arrayidx1, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx2, align 4
+ ret i32 %7
+}
+
+; /// Different objects, positive induction with widening slide
+; int noAlias02 (int a) {
+; int i;
+; for (i=0; i<SIZE-10; i++)
+; Foo.A[i] = Foo.B[i+10] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias02(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias02(i32 %a) {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 90
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %add = add nsw i32 %1, 10
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %add
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add1 = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+ store i32 %add1, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx3, align 4
+ ret i32 %7
+}
+
+; /// Different objects, positive induction with shortening slide
+; int noAlias03 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[i+10] = Foo.B[i] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias03(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias03(i32 %a) {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %add1 = add nsw i32 %4, 10
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add1
+ store i32 %add, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx3, align 4
+ ret i32 %7
+}
+
+; /// Pointer access, positive stride, run-time check added
+; int noAlias04 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; *(PA+i) = *(PB+i) + a;
+; return *(PA+a);
+; }
+; CHECK-LABEL: define i32 @noAlias04(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+;
+; TODO: This test vectorizes (with a run-time check) on real targets with -O3.
+; Check why it is not being vectorized here even when vectorization is forced.
+
+define i32 @noAlias04(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32*, i32** @PB, align 4
+ %2 = load i32, i32* %i, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %1, i32 %2
+ %3 = load i32, i32* %add.ptr, align 4
+ %4 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %3, %4
+ %5 = load i32*, i32** @PA, align 4
+ %6 = load i32, i32* %i, align 4
+ %add.ptr1 = getelementptr inbounds i32, i32* %5, i32 %6
+ store i32 %add, i32* %add.ptr1, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %7 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %7, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %8 = load i32*, i32** @PA, align 4
+ %9 = load i32, i32* %a.addr, align 4
+ %add.ptr2 = getelementptr inbounds i32, i32* %8, i32 %9
+ %10 = load i32, i32* %add.ptr2, align 4
+ ret i32 %10
+}
+
+; /// Different objects, positive induction, multi-array
+; int noAlias05 (int a) {
+; int i, N=10;
+; for (i=0; i<SIZE; i++)
+; Bar.A[N][i] = Bar.B[N][i] + a;
+; return Bar.A[N][a];
+; }
+; CHECK-LABEL: define i32 @noAlias05(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias05(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ %N = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 10, i32* %N, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %2 = load i32, i32* %N, align 4
+ %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 2), i32 0, i32 %2
+ %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i32 0, i32 %1
+ %3 = load i32, i32* %arrayidx1, align 4
+ %4 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %3, %4
+ %5 = load i32, i32* %i, align 4
+ %6 = load i32, i32* %N, align 4
+ %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx2, i32 0, i32 %5
+ store i32 %add, i32* %arrayidx3, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %7 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %7, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %8 = load i32, i32* %a.addr, align 4
+ %9 = load i32, i32* %N, align 4
+ %arrayidx4 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9
+ %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx4, i32 0, i32 %8
+ %10 = load i32, i32* %arrayidx5, align 4
+ ret i32 %10
+}
+
+; /// Same objects, positive induction, multi-array, different sub-elements
+; int noAlias06 (int a) {
+; int i, N=10;
+; for (i=0; i<SIZE; i++)
+; Bar.A[N][i] = Bar.A[N+1][i] + a;
+; return Bar.A[N][a];
+; }
+; CHECK-LABEL: define i32 @noAlias06(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias06(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ %N = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 10, i32* %N, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %2 = load i32, i32* %N, align 4
+ %add = add nsw i32 %2, 1
+ %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %add
+ %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i32 0, i32 %1
+ %3 = load i32, i32* %arrayidx1, align 4
+ %4 = load i32, i32* %a.addr, align 4
+ %add2 = add nsw i32 %3, %4
+ %5 = load i32, i32* %i, align 4
+ %6 = load i32, i32* %N, align 4
+ %arrayidx3 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6
+ %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx3, i32 0, i32 %5
+ store i32 %add2, i32* %arrayidx4, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %7 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %7, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %8 = load i32, i32* %a.addr, align 4
+ %9 = load i32, i32* %N, align 4
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9
+ %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx5, i32 0, i32 %8
+ %10 = load i32, i32* %arrayidx6, align 4
+ ret i32 %10
+}
+
+; /// Different objects, negative induction, constant distance
+; int noAlias07 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[SIZE-i-1] = Foo.B[SIZE-i-1] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias07(
+; CHECK: store <4 x i32>
+; CHECK: ret
+define i32 @noAlias07(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 1
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %sub2 = sub nsw i32 100, %4
+ %sub3 = sub nsw i32 %sub2, 1
+ %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3
+ store i32 %add, i32* %arrayidx4, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx5, align 4
+ ret i32 %7
+}
+
+; /// Different objects, negative induction, shortening slide
+; int noAlias08 (int a) {
+; int i;
+; for (i=0; i<SIZE-10; i++)
+; Foo.A[SIZE-i-1] = Foo.B[SIZE-i-10] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias08(
+; CHECK: load <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias08(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 90
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 10
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %sub2 = sub nsw i32 100, %4
+ %sub3 = sub nsw i32 %sub2, 1
+ %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3
+ store i32 %add, i32* %arrayidx4, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx5, align 4
+ ret i32 %7
+}
+
+; /// Different objects, negative induction, widening slide
+; int noAlias09 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[SIZE-i-10] = Foo.B[SIZE-i-1] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias09(
+; CHECK: load <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias09(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 1
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %sub2 = sub nsw i32 100, %4
+ %sub3 = sub nsw i32 %sub2, 10
+ %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3
+ store i32 %add, i32* %arrayidx4, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx5, align 4
+ ret i32 %7
+}
+
+; /// Pointer access, negative stride, run-time check added
+; int noAlias10 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; *(PA+SIZE-i-1) = *(PB+SIZE-i-1) + a;
+; return *(PA+a);
+; }
+; CHECK-LABEL: define i32 @noAlias10(
+; CHECK-NOT: sub {{.*}} <4 x i32>
+; CHECK: ret
+;
+; TODO: This test vectorizes (with a run-time check) on real targets with -O3.
+; Check why it is not being vectorized here even when vectorization is forced.
+
+define i32 @noAlias10(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32*, i32** @PB, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %1, i32 100
+ %2 = load i32, i32* %i, align 4
+ %idx.neg = sub i32 0, %2
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %idx.neg
+ %add.ptr2 = getelementptr inbounds i32, i32* %add.ptr1, i32 -1
+ %3 = load i32, i32* %add.ptr2, align 4
+ %4 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %3, %4
+ %5 = load i32*, i32** @PA, align 4
+ %add.ptr3 = getelementptr inbounds i32, i32* %5, i32 100
+ %6 = load i32, i32* %i, align 4
+ %idx.neg4 = sub i32 0, %6
+ %add.ptr5 = getelementptr inbounds i32, i32* %add.ptr3, i32 %idx.neg4
+ %add.ptr6 = getelementptr inbounds i32, i32* %add.ptr5, i32 -1
+ store i32 %add, i32* %add.ptr6, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %7 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %7, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %8 = load i32*, i32** @PA, align 4
+ %9 = load i32, i32* %a.addr, align 4
+ %add.ptr7 = getelementptr inbounds i32, i32* %8, i32 %9
+ %10 = load i32, i32* %add.ptr7, align 4
+ ret i32 %10
+}
+
+; /// Different objects, negative induction, multi-array
+; int noAlias11 (int a) {
+; int i, N=10;
+; for (i=0; i<SIZE; i++)
+; Bar.A[N][SIZE-i-1] = Bar.B[N][SIZE-i-1] + a;
+; return Bar.A[N][a];
+; }
+; CHECK-LABEL: define i32 @noAlias11(
+; CHECK: store <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias11(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ %N = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 10, i32* %N, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 1
+ %2 = load i32, i32* %N, align 4
+ %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 2), i32 0, i32 %2
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i32 0, i32 %sub1
+ %3 = load i32, i32* %arrayidx2, align 4
+ %4 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %3, %4
+ %5 = load i32, i32* %i, align 4
+ %sub3 = sub nsw i32 100, %5
+ %sub4 = sub nsw i32 %sub3, 1
+ %6 = load i32, i32* %N, align 4
+ %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6
+ %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx5, i32 0, i32 %sub4
+ store i32 %add, i32* %arrayidx6, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %7 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %7, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %8 = load i32, i32* %a.addr, align 4
+ %9 = load i32, i32* %N, align 4
+ %arrayidx7 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9
+ %arrayidx8 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx7, i32 0, i32 %8
+ %10 = load i32, i32* %arrayidx8, align 4
+ ret i32 %10
+}
+
+; /// Same objects, negative induction, multi-array, different sub-elements
+; int noAlias12 (int a) {
+; int i, N=10;
+; for (i=0; i<SIZE; i++)
+; Bar.A[N][SIZE-i-1] = Bar.A[N+1][SIZE-i-1] + a;
+; return Bar.A[N][a];
+; }
+; CHECK-LABEL: define i32 @noAlias12(
+; CHECK: store <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias12(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ %N = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 10, i32* %N, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 1
+ %2 = load i32, i32* %N, align 4
+ %add = add nsw i32 %2, 1
+ %arrayidx = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %add
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx, i32 0, i32 %sub1
+ %3 = load i32, i32* %arrayidx2, align 4
+ %4 = load i32, i32* %a.addr, align 4
+ %add3 = add nsw i32 %3, %4
+ %5 = load i32, i32* %i, align 4
+ %sub4 = sub nsw i32 100, %5
+ %sub5 = sub nsw i32 %sub4, 1
+ %6 = load i32, i32* %N, align 4
+ %arrayidx6 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %6
+ %arrayidx7 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx6, i32 0, i32 %sub5
+ store i32 %add3, i32* %arrayidx7, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %7 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %7, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %8 = load i32, i32* %a.addr, align 4
+ %9 = load i32, i32* %N, align 4
+ %arrayidx8 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* getelementptr inbounds (%struct.anon.0, %struct.anon.0* @Bar, i32 0, i32 0), i32 0, i32 %9
+ %arrayidx9 = getelementptr inbounds [100 x i32], [100 x i32]* %arrayidx8, i32 0, i32 %8
+ %10 = load i32, i32* %arrayidx9, align 4
+ ret i32 %10
+}
+
+; /// Same objects, positive induction, constant distance, just enough for vector size
+; int noAlias13 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[i] = Foo.A[i+4] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias13(
+; CHECK: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias13(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %add = add nsw i32 %1, 4
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add1 = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+ store i32 %add1, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx3, align 4
+ ret i32 %7
+}
+
+; /// Same objects, negative induction, constant distance, just enough for vector size
+; int noAlias14 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[SIZE-i-1] = Foo.A[SIZE-i-5] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @noAlias14(
+; CHECK: load <4 x i32>
+; CHECK: ret
+
+define i32 @noAlias14(i32 %a) #0 {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 5
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %sub2 = sub nsw i32 100, %4
+ %sub3 = sub nsw i32 %sub2, 1
+ %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub3
+ store i32 %add, i32* %arrayidx4, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx5 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx5, align 4
+ ret i32 %7
+}
+
+
+;; === Now, the tests that we could vectorize with induction changes or run-time checks ===
+
+
+; /// Different objects, swapped induction, alias at the end
+; int mayAlias01 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[i] = Foo.B[SIZE-i-1] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mayAlias01(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mayAlias01(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 1
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+ store i32 %add, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx3, align 4
+ ret i32 %7
+}
+
+; /// Different objects, swapped induction, alias at the beginning
+; int mayAlias02 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[SIZE-i-1] = Foo.B[i] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mayAlias02(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mayAlias02(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %4
+ %sub1 = sub nsw i32 %sub, 1
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %sub1
+ store i32 %add, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx3, align 4
+ ret i32 %7
+}
+
+; /// Pointer access, run-time check added
+; int mayAlias03 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; *(PA+i) = *(PB+SIZE-i-1) + a;
+; return *(PA+a);
+; }
+; CHECK-LABEL: define i32 @mayAlias03(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mayAlias03(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32*, i32** @PB, align 4
+ %add.ptr = getelementptr inbounds i32, i32* %1, i32 100
+ %2 = load i32, i32* %i, align 4
+ %idx.neg = sub i32 0, %2
+ %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i32 %idx.neg
+ %add.ptr2 = getelementptr inbounds i32, i32* %add.ptr1, i32 -1
+ %3 = load i32, i32* %add.ptr2, align 4
+ %4 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %3, %4
+ %5 = load i32*, i32** @PA, align 4
+ %6 = load i32, i32* %i, align 4
+ %add.ptr3 = getelementptr inbounds i32, i32* %5, i32 %6
+ store i32 %add, i32* %add.ptr3, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %7 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %7, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %8 = load i32*, i32** @PA, align 4
+ %9 = load i32, i32* %a.addr, align 4
+ %add.ptr4 = getelementptr inbounds i32, i32* %8, i32 %9
+ %10 = load i32, i32* %add.ptr4, align 4
+ ret i32 %10
+}
+
+
+;; === Finally, the tests that should vectorize only with care (or only if we are willing to ignore undefined behaviour altogether) ===
+
+
+; int mustAlias01 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[i+10] = Foo.B[SIZE-i-1] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mustAlias01(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias01(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 1
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %add2 = add nsw i32 %4, 10
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add2
+ store i32 %add, i32* %arrayidx3, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx4, align 4
+ ret i32 %7
+}
+
+; int mustAlias02 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[i] = Foo.B[SIZE-i-10] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mustAlias02(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias02(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 10
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %4
+ store i32 %add, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx3, align 4
+ ret i32 %7
+}
+
+; int mustAlias03 (int a) {
+; int i;
+; for (i=0; i<SIZE; i++)
+; Foo.A[i+10] = Foo.B[SIZE-i-10] + a;
+; return Foo.A[a];
+; }
+; CHECK-LABEL: define i32 @mustAlias03(
+; CHECK-NOT: add nsw <4 x i32>
+; CHECK: ret
+
+define i32 @mustAlias03(i32 %a) nounwind {
+entry:
+ %a.addr = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %a, i32* %a.addr, align 4
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %sub = sub nsw i32 100, %1
+ %sub1 = sub nsw i32 %sub, 10
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 2), i32 0, i32 %sub1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = load i32, i32* %a.addr, align 4
+ %add = add nsw i32 %2, %3
+ %4 = load i32, i32* %i, align 4
+ %add2 = add nsw i32 %4, 10
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %add2
+ store i32 %add, i32* %arrayidx3, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %5, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %6 = load i32, i32* %a.addr, align 4
+ %arrayidx4 = getelementptr inbounds [100 x i32], [100 x i32]* getelementptr inbounds (%struct.anon, %struct.anon* @Foo, i32 0, i32 0), i32 0, i32 %6
+ %7 = load i32, i32* %arrayidx4, align 4
+ ret i32 %7
+}
diff --git a/llvm/test/Transforms/LoopVectorize/hints-trans.ll b/llvm/test/Transforms/LoopVectorize/hints-trans.ll
new file mode 100644
index 00000000000..aa7af3de09d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/hints-trans.ll
@@ -0,0 +1,29 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instsimplify -simplifycfg < %s | FileCheck %s
+; Note: -instsimplify -simplifycfg remove the (now dead) original loop, making
+; it easy to test that the llvm.loop.unroll.disable hint is still present.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture %b) #0 {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ store i32 1, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 16
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+; CHECK-LABEL: @foo
+; CHECK: = !{!"llvm.loop.unroll.disable"}
+
+attributes #0 = { norecurse nounwind uwtable }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.unroll.disable"}
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-loads.ll b/llvm/test/Transforms/LoopVectorize/hoist-loads.ll
new file mode 100644
index 00000000000..db4774d8ba9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/hoist-loads.ll
@@ -0,0 +1,70 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@A = common global [1024 x float] zeroinitializer, align 16
+@B = common global [1024 x float] zeroinitializer, align 16
+
+; Make sure we can vectorize in the presence of hoistable conditional loads.
+; CHECK-LABEL: @hoist_cond_load(
+; CHECK: load <2 x float>
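+;
+; A rough C equivalent of the loop below (a reconstruction for illustration,
+; not from the original source):
+;
+;   for (int i = 0; i < 1024; ++i)
+;     A[i] = (B[i] == 0.0f) ? 0.0f : A[i];
+;
+; The load of A[i] happens only when B[i] != 0.0f, but the same address is
+; unconditionally stored to in every iteration, so the load is safe to
+; speculate and can be widened.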
+
+define void @hoist_cond_load() {
+entry:
+ br label %for.body
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end9 ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx2, align 4
+ %cmp3 = fcmp oeq float %0, 0.000000e+00
+ br i1 %cmp3, label %if.end9, label %if.else
+
+if.else:
+ %1 = load float, float* %arrayidx, align 4
+ br label %if.end9
+
+if.end9:
+ %tmp.0 = phi float [ %1, %if.else ], [ 0.000000e+00, %for.body ]
+ store float %tmp.0, float* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; However, we can't hoist loads whose address we have not seen unconditionally
+; accessed. One wide load is fine, but not the second.
+; CHECK-LABEL: @dont_hoist_cond_load(
+; CHECK: load <2 x float>
+; CHECK-NOT: load <2 x float>
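+;
+; Roughly (again a reconstruction for illustration):
+;
+;   for (int i = 0; i < 1024; ++i)
+;     B[i] = (B[i] == 0.0f) ? 0.0f : A[i];
+;
+; The conditional load is from A[i], an address this loop never accesses
+; unconditionally, so it may not be speculated into a second wide load.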
+
+define void @dont_hoist_cond_load() {
+entry:
+ br label %for.body
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end9 ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @A, i64 0, i64 %indvars.iv
+ %arrayidx2 = getelementptr inbounds [1024 x float], [1024 x float]* @B, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx2, align 4
+ %cmp3 = fcmp oeq float %0, 0.000000e+00
+ br i1 %cmp3, label %if.end9, label %if.else
+
+if.else:
+ %1 = load float, float* %arrayidx, align 4
+ br label %if.end9
+
+if.end9:
+ %tmp.0 = phi float [ %1, %if.else ], [ 0.000000e+00, %for.body ]
+ store float %tmp.0, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/i8-induction.ll b/llvm/test/Transforms/LoopVectorize/i8-induction.ll
new file mode 100644
index 00000000000..a9e8b755f18
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/i8-induction.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S
+; RUN: opt < %s -debugify -loop-vectorize -S | FileCheck %s --check-prefix=DEBUGLOC
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global i8 0, align 1
+@b = common global i8 0, align 1
+
+define void @f() nounwind uwtable ssp {
+; Check that the induction phis and adds have debug location.
+;
+; DEBUGLOC-LABEL: vector.body:
+; DEBUGLOC: %vec.ind = phi {{.*}}, !dbg ![[DbgLoc:[0-9]+]]
+; DEBUGLOC: %vec.ind.next = add {{.*}}, !dbg ![[DbgLoc]]
+
+scalar.ph:
+ store i8 0, i8* inttoptr (i64 1 to i8*), align 1
+ %0 = load i8, i8* @a, align 1
+ br label %for.body
+
+for.body:
+ %mul16 = phi i8 [ 0, %scalar.ph ], [ %mul, %for.body ] ; <------- i8 induction var.
+ %c.015 = phi i8 [ undef, %scalar.ph ], [ %conv8, %for.body ]
+ %conv2 = sext i8 %c.015 to i32
+ %tobool = icmp ne i8 %c.015, 0
+ %.sink = select i1 %tobool, i8 %c.015, i8 %0
+ %mul = mul i8 %mul16, %.sink
+ %add = add nsw i32 %conv2, 1
+ %conv8 = trunc i32 %add to i8
+ %sext = shl i32 %add, 24
+ %phitmp14 = icmp slt i32 %sext, 268435456
+ br i1 %phitmp14, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ store i8 %mul, i8* @b, align 1
+ ret void
+}
+
+; Check that the location of the new phi comes from %c.015 = phi i8
+; DEBUGLOC: ![[DbgLoc]] = !DILocation(line: 5
diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
new file mode 100644
index 00000000000..f8d20c32bed
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
@@ -0,0 +1,35 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: more_than_one_use
+;
+; PR30627. Check that a compare instruction with more than one use is not
+; recognized as uniform and is instead vectorized.
+;
+; CHECK-NOT: Found uniform instruction: %cond = icmp slt i64 %i.next, %n
+; CHECK: vector.body
+; CHECK: %[[I:.+]] = add nuw nsw <4 x i64> %vec.ind, <i64 1, i64 1, i64 1, i64 1>
+; CHECK: icmp slt <4 x i64> %[[I]], %broadcast.splat
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
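+; (For context: %cond below feeds both the backedge branch, a uniform use,
+; and the select that computes the load address, a per-lane use; treating it
+; as uniform would lose the per-lane values the select needs.)
+;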
+define i32 @more_than_one_use(i32* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %r = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ %tmp0 = select i1 %cond, i64 %i.next, i64 0
+ %tmp1 = getelementptr inbounds i32, i32* %a, i64 %tmp0
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %tmp3 = add i32 %r, %tmp2
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp4 = phi i32 [ %tmp3, %for.body ]
+ ret i32 %tmp4
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-conv-crash.ll b/llvm/test/Transforms/LoopVectorize/if-conv-crash.ll
new file mode 100644
index 00000000000..d82779a30e8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-conv-crash.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define fastcc void @DD_dump() nounwind uwtable ssp {
+entry:
+ br i1 undef, label %lor.lhs.false, label %if.end25
+
+lor.lhs.false: ; preds = %entry
+ br i1 undef, label %if.end21, label %if.else
+
+if.else: ; preds = %lor.lhs.false
+ br i1 undef, label %num_q.exit, label %while.body.i.preheader
+
+while.body.i.preheader: ; preds = %if.else
+ br label %while.body.i
+
+while.body.i: ; preds = %if.end.i, %while.body.i.preheader
+ switch i8 undef, label %if.end.i [
+ i8 39, label %if.then.i
+ i8 92, label %if.then.i
+ ]
+
+if.then.i: ; preds = %while.body.i, %while.body.i
+ br label %if.end.i
+
+if.end.i: ; preds = %if.then.i, %while.body.i
+ br i1 undef, label %num_q.exit, label %while.body.i
+
+num_q.exit: ; preds = %if.end.i, %if.else
+ unreachable
+
+if.end21: ; preds = %lor.lhs.false
+ unreachable
+
+if.end25: ; preds = %entry
+ ret void
+}
+
+; PR15990
+; We can have basic blocks with single entry PHI nodes.
+define void @single_entry_phi(i32* %a, i32 *%b) {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %inc10 = phi i32 [ 0, %entry ], [ %inc, %for.end ]
+ br label %for.end
+
+for.end:
+ %malicious.phi = phi i32 [ 0, %for.cond1.preheader ]
+ %inc = add nsw i32 %inc10, 1
+ %tobool = icmp eq i32 %inc, 0
+ br i1 %tobool, label %for.cond.for.end5, label %for.cond1.preheader
+
+for.cond.for.end5:
+ %and.lcssa = phi i32 [ %malicious.phi, %for.end ]
+ store i32 %and.lcssa, i32* %a, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
new file mode 100644
index 00000000000..83386a0d688
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-edgemasks.ll
@@ -0,0 +1,245 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = global i32* null, align 8
+@b = global i32* null, align 8
+@c = global i32* null, align 8
+
+; Don't create exponentially large IR for the edge masks needed when
+; if-converting this code.
+
+; PR16472
+
+; CHECK-NOT: %6000000 =
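+;
+; (For context: the edge mask of a block is an OR over its predecessors of
+; the predecessor's block mask ANDed with the edge condition; recomputing
+; these recursively per path through the many diamonds below would grow the
+; IR exponentially, so block masks need to be computed once and reused.)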
+
+define void @_Z3fn4i(i32 %p1) {
+entry:
+ %cmp88 = icmp sgt i32 %p1, 0
+ br i1 %cmp88, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %0 = load i32*, i32** @b, align 8
+ %1 = load i32*, i32** @a, align 8
+ %2 = load i32*, i32** @c, align 8
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %_ZL3fn3ii.exit58 ]
+ %arrayidx = getelementptr inbounds i32, i32* %0, i64 %indvars.iv
+ %3 = load i32, i32* %arrayidx, align 4
+ %4 = trunc i64 %indvars.iv to i32
+ %and.i = and i32 %4, 1
+ %tobool.i.i = icmp eq i32 %and.i, 0
+ br i1 %tobool.i.i, label %if.end.i, label %if.then.i
+
+if.then.i:
+ %and.i.i = lshr i32 %3, 2
+ %and.lobit.i.i = and i32 %and.i.i, 1
+ %5 = xor i32 %and.lobit.i.i, 1
+ %or.i.i = or i32 %5, %3
+ %cmp.i = icmp sgt i32 %or.i.i, 0
+ %conv.i = zext i1 %cmp.i to i32
+ br label %if.end.i
+
+if.end.i:
+ %tobool.i87 = phi i1 [ true, %if.then.i ], [ false, %for.body ]
+ %p1.addr.0.i = phi i32 [ %conv.i, %if.then.i ], [ %3, %for.body ]
+ %6 = trunc i64 %indvars.iv to i32
+ %and1.i = and i32 %6, 7
+ %tobool2.i = icmp eq i32 %and1.i, 0
+ br i1 %tobool2.i, label %if.end7.i, label %if.then3.i
+
+if.then3.i:
+ %p1.addr.0.lobit.i = lshr i32 %p1.addr.0.i, 31
+ %and6.i = and i32 %p1.addr.0.i, 1
+ %or.i = or i32 %p1.addr.0.lobit.i, %and6.i
+ br label %if.end7.i
+
+if.end7.i:
+ %p1.addr.1.i = phi i32 [ %or.i, %if.then3.i ], [ %p1.addr.0.i, %if.end.i ]
+ br i1 %tobool.i87, label %if.then10.i, label %if.end13.i
+
+if.then10.i:
+ %cmp11.i = icmp sgt i32 %p1.addr.1.i, 0
+ %conv12.i = zext i1 %cmp11.i to i32
+ br label %if.end13.i
+
+if.end13.i:
+ %p1.addr.2.i = phi i32 [ %conv12.i, %if.then10.i ], [ %p1.addr.1.i, %if.end7.i ]
+ br i1 %tobool.i.i, label %_Z3fn2iii.exit, label %if.then16.i
+
+if.then16.i:
+ %and17.i = lshr i32 %p1.addr.2.i, 3
+ %and17.lobit.i = and i32 %and17.i, 1
+ br label %_Z3fn2iii.exit
+
+_Z3fn2iii.exit:
+ %p1.addr.3.i = phi i32 [ %and17.lobit.i, %if.then16.i ], [ %p1.addr.2.i, %if.end13.i ]
+ %7 = trunc i64 %indvars.iv to i32
+ %shr.i = ashr i32 %7, 1
+ %and.i18.i = and i32 %shr.i, 1
+ %tobool.i19.i = icmp ne i32 %and.i18.i, 0
+ br i1 %tobool.i19.i, label %if.then.i20.i, label %if.end.i.i
+
+if.then.i20.i:
+ %cmp.i.i = icmp sgt i32 %p1.addr.3.i, 0
+ %conv.i.i = zext i1 %cmp.i.i to i32
+ br label %if.end.i.i
+
+if.end.i.i:
+ %p1.addr.0.i21.i = phi i32 [ %conv.i.i, %if.then.i20.i ], [ %p1.addr.3.i, %_Z3fn2iii.exit ]
+ %and1.i.i = and i32 %shr.i, 7
+ %tobool2.i.i = icmp eq i32 %and1.i.i, 0
+ br i1 %tobool2.i.i, label %if.end7.i.i, label %if.then3.i.i
+
+if.then3.i.i:
+ %p1.addr.0.lobit.i.i = lshr i32 %p1.addr.0.i21.i, 31
+ %and6.i.i = and i32 %p1.addr.0.i21.i, 1
+ %or.i22.i = or i32 %p1.addr.0.lobit.i.i, %and6.i.i
+ br label %if.end7.i.i
+
+if.end7.i.i:
+ %p1.addr.1.i.i = phi i32 [ %or.i22.i, %if.then3.i.i ], [ %p1.addr.0.i21.i, %if.end.i.i ]
+ br i1 %tobool.i19.i, label %if.then10.i.i, label %if.end13.i.i
+
+if.then10.i.i:
+ %cmp11.i.i = icmp sgt i32 %p1.addr.1.i.i, 0
+ %conv12.i.i = zext i1 %cmp11.i.i to i32
+ br label %if.end13.i.i
+
+if.end13.i.i:
+ %p1.addr.2.i.i = phi i32 [ %conv12.i.i, %if.then10.i.i ], [ %p1.addr.1.i.i, %if.end7.i.i ]
+ %and14.i.i = and i32 %shr.i, 5
+ %tobool15.i.i = icmp eq i32 %and14.i.i, 0
+ br i1 %tobool15.i.i, label %_Z3fn2iii.exit.i, label %if.then16.i.i
+
+if.then16.i.i:
+ %and17.i.i = lshr i32 %p1.addr.2.i.i, 3
+ %and17.lobit.i.i = and i32 %and17.i.i, 1
+ br label %_Z3fn2iii.exit.i
+
+_Z3fn2iii.exit.i:
+ %p1.addr.3.i.i = phi i32 [ %and17.lobit.i.i, %if.then16.i.i ], [ %p1.addr.2.i.i, %if.end13.i.i ]
+ %8 = trunc i64 %indvars.iv to i32
+ %tobool.i11.i = icmp eq i32 %8, 0
+ br i1 %tobool.i11.i, label %_ZL3fn3ii.exit, label %if.then.i15.i
+
+if.then.i15.i:
+ %and.i12.i = lshr i32 %p1.addr.3.i.i, 2
+ %and.lobit.i13.i = and i32 %and.i12.i, 1
+ %9 = xor i32 %and.lobit.i13.i, 1
+ %or.i14.i = or i32 %9, %p1.addr.3.i.i
+ br label %_ZL3fn3ii.exit
+
+_ZL3fn3ii.exit:
+ %p1.addr.0.i16.i = phi i32 [ %or.i14.i, %if.then.i15.i ], [ %p1.addr.3.i.i, %_Z3fn2iii.exit.i ]
+ %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv
+ store i32 %p1.addr.0.i16.i, i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %0, i64 %indvars.iv
+ %10 = load i32, i32* %arrayidx4, align 4
+ br i1 %tobool.i.i, label %_Z3fn1ii.exit.i26, label %if.then.i.i21
+
+if.then.i.i21:
+ %and.i.i18 = lshr i32 %10, 2
+ %and.lobit.i.i19 = and i32 %and.i.i18, 1
+ %11 = xor i32 %and.lobit.i.i19, 1
+ %or.i.i20 = or i32 %11, %10
+ br label %_Z3fn1ii.exit.i26
+
+_Z3fn1ii.exit.i26:
+ %p1.addr.0.i.i22 = phi i32 [ %or.i.i20, %if.then.i.i21 ], [ %10, %_ZL3fn3ii.exit ]
+ br i1 %tobool.i87, label %if.then.i63, label %if.end.i67
+
+if.then.i63:
+ %cmp.i61 = icmp sgt i32 %p1.addr.0.i.i22, 0
+ %conv.i62 = zext i1 %cmp.i61 to i32
+ br label %if.end.i67
+
+if.end.i67:
+ %p1.addr.0.i64 = phi i32 [ %conv.i62, %if.then.i63 ], [ %p1.addr.0.i.i22, %_Z3fn1ii.exit.i26 ]
+ br i1 %tobool2.i, label %if.end7.i73, label %if.then3.i71
+
+if.then3.i71:
+ %p1.addr.0.lobit.i68 = lshr i32 %p1.addr.0.i64, 31
+ %and6.i69 = and i32 %p1.addr.0.i64, 1
+ %or.i70 = or i32 %p1.addr.0.lobit.i68, %and6.i69
+ br label %if.end7.i73
+
+if.end7.i73:
+ %p1.addr.1.i72 = phi i32 [ %or.i70, %if.then3.i71 ], [ %p1.addr.0.i64, %if.end.i67 ]
+ br i1 %tobool.i87, label %if.then10.i76, label %if.end13.i80
+
+if.then10.i76:
+ %cmp11.i74 = icmp sgt i32 %p1.addr.1.i72, 0
+ %conv12.i75 = zext i1 %cmp11.i74 to i32
+ br label %if.end13.i80
+
+if.end13.i80:
+ %p1.addr.2.i77 = phi i32 [ %conv12.i75, %if.then10.i76 ], [ %p1.addr.1.i72, %if.end7.i73 ]
+ br i1 %tobool.i.i, label %_Z3fn2iii.exit85, label %if.then16.i83
+
+if.then16.i83:
+ %and17.i81 = lshr i32 %p1.addr.2.i77, 3
+ %and17.lobit.i82 = and i32 %and17.i81, 1
+ br label %_Z3fn2iii.exit85
+
+_Z3fn2iii.exit85:
+ %p1.addr.3.i84 = phi i32 [ %and17.lobit.i82, %if.then16.i83 ], [ %p1.addr.2.i77, %if.end13.i80 ]
+ br i1 %tobool.i19.i, label %if.then.i20.i29, label %if.end.i.i33
+
+if.then.i20.i29:
+ %cmp.i.i27 = icmp sgt i32 %p1.addr.3.i84, 0
+ %conv.i.i28 = zext i1 %cmp.i.i27 to i32
+ br label %if.end.i.i33
+
+if.end.i.i33:
+ %p1.addr.0.i21.i30 = phi i32 [ %conv.i.i28, %if.then.i20.i29 ], [ %p1.addr.3.i84, %_Z3fn2iii.exit85 ]
+ br i1 %tobool2.i.i, label %if.end7.i.i39, label %if.then3.i.i37
+
+if.then3.i.i37:
+ %p1.addr.0.lobit.i.i34 = lshr i32 %p1.addr.0.i21.i30, 31
+ %and6.i.i35 = and i32 %p1.addr.0.i21.i30, 1
+ %or.i22.i36 = or i32 %p1.addr.0.lobit.i.i34, %and6.i.i35
+ br label %if.end7.i.i39
+
+if.end7.i.i39:
+ %p1.addr.1.i.i38 = phi i32 [ %or.i22.i36, %if.then3.i.i37 ], [ %p1.addr.0.i21.i30, %if.end.i.i33 ]
+ br i1 %tobool.i19.i, label %if.then10.i.i42, label %if.end13.i.i46
+
+if.then10.i.i42:
+ %cmp11.i.i40 = icmp sgt i32 %p1.addr.1.i.i38, 0
+ %conv12.i.i41 = zext i1 %cmp11.i.i40 to i32
+ br label %if.end13.i.i46
+
+if.end13.i.i46:
+ %p1.addr.2.i.i43 = phi i32 [ %conv12.i.i41, %if.then10.i.i42 ], [ %p1.addr.1.i.i38, %if.end7.i.i39 ]
+ br i1 %tobool15.i.i, label %_Z3fn2iii.exit.i52, label %if.then16.i.i49
+
+if.then16.i.i49:
+ %and17.i.i47 = lshr i32 %p1.addr.2.i.i43, 3
+ %and17.lobit.i.i48 = and i32 %and17.i.i47, 1
+ br label %_Z3fn2iii.exit.i52
+
+_Z3fn2iii.exit.i52:
+ %p1.addr.3.i.i50 = phi i32 [ %and17.lobit.i.i48, %if.then16.i.i49 ], [ %p1.addr.2.i.i43, %if.end13.i.i46 ]
+ br i1 %tobool.i11.i, label %_ZL3fn3ii.exit58, label %if.then.i15.i56
+
+if.then.i15.i56:
+ %and.i12.i53 = lshr i32 %p1.addr.3.i.i50, 2
+ %and.lobit.i13.i54 = and i32 %and.i12.i53, 1
+ %12 = xor i32 %and.lobit.i13.i54, 1
+ %or.i14.i55 = or i32 %12, %p1.addr.3.i.i50
+ br label %_ZL3fn3ii.exit58
+
+_ZL3fn3ii.exit58:
+ %p1.addr.0.i16.i57 = phi i32 [ %or.i14.i55, %if.then.i15.i56 ], [ %p1.addr.3.i.i50, %_Z3fn2iii.exit.i52 ]
+ %arrayidx7 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv
+ store i32 %p1.addr.0.i16.i57, i32* %arrayidx7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, %p1
+ br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
new file mode 100644
index 00000000000..f254bc81a7c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP4]], [[A]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]]
+; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !3
+; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD6]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], <i32 19, i32 19, i32 19, i32 19>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD6]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP13]], <4 x i32> <i32 4, i32 4, i32 4, i32 4>, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP17:%.*]] = and <4 x i1> [[TMP11]], [[TMP16]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x i32> <i32 9, i32 9, i32 9, i32 9>
+; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP14]], <4 x i32> [[PREDPHI]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[PREDPHI7]], <4 x i32>* [[TMP18]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[CMP3:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[IF_THEN:%.*]], label [[IF_END14]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP20]], 19
+; CHECK-NEXT: br i1 [[CMP6]], label [[IF_END14]], label [[IF_ELSE:%.*]]
+; CHECK: if.else:
+; CHECK-NEXT: [[CMP10:%.*]] = icmp slt i32 [[TMP21]], 4
+; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP10]], i32 4, i32 5
+; CHECK-NEXT: br label [[IF_END14]]
+; CHECK: if.end14:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 9, [[FOR_BODY]] ], [ 3, [[IF_THEN]] ], [ [[DOT]], [[IF_ELSE]] ]
+; CHECK-NEXT: store i32 [[X_0]], i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %cmp26 = icmp sgt i32 %n, 0
+ br i1 %cmp26, label %for.body, label %for.end
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %if.end14 ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %cmp3 = icmp sgt i32 %0, %1
+ br i1 %cmp3, label %if.then, label %if.end14
+
+if.then:
+ %cmp6 = icmp sgt i32 %0, 19
+ br i1 %cmp6, label %if.end14, label %if.else
+
+if.else:
+ %cmp10 = icmp slt i32 %1, 4
+ %. = select i1 %cmp10, i32 4, i32 5
+ br label %if.end14
+
+if.end14:
+ %x.0 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %., %if.else ] ; <------------- A PHI with 3 entries that we can still vectorize.
+ store i32 %x.0, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 undef
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-reduction.ll
new file mode 100644
index 00000000000..a58ef3b1e05
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-reduction.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @reduction_func(
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_func(i32* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ %cmp10 = icmp sgt i32 %n, 0
+ br i1 %cmp10, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %sum.011 = phi i32 [ %sum.1, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, 30
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %add = add i32 %sum.011, 2
+ %add4 = add i32 %add, %0
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %sum.1 = phi i32 [ %add4, %if.then ], [ %sum.011, %for.body ]
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ 4, %for.inc ]
+ ret i32 %sum.0.lcssa
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion.ll b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
new file mode 100644
index 00000000000..dfc062eaada
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion.ll
@@ -0,0 +1,197 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; This is the loop in this example:
+;
+;int function0(int *a, int *b, int start, int end) {
+;
+; for (int i=start; i<end; ++i) {
+; unsigned k = a[i];
+;
+; if (a[i] > b[i]) <------ notice the IF inside the loop.
+; k = k * 5 + 3;
+;
+; a[i] = k; <---- K is a phi node that becomes vector-select.
+; }
+;}
+
+;CHECK-LABEL: @function0(
+;CHECK: load <4 x i32>
+;CHECK: icmp sgt <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: select <4 x i1>
+;CHECK: ret i32
+define i32 @function0(i32* nocapture %a, i32* nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
+entry:
+ %cmp16 = icmp slt i32 %start, %end
+ br i1 %cmp16, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %0 = sext i32 %start to i64
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %if.end ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ %2 = load i32, i32* %arrayidx4, align 4
+ %cmp5 = icmp sgt i32 %1, %2
+ br i1 %cmp5, label %if.then, label %if.end
+
+if.then:
+ %mul = mul i32 %1, 5
+ %add = add i32 %mul, 3
+ br label %if.end
+
+if.end:
+ %k.0 = phi i32 [ %add, %if.then ], [ %1, %for.body ]
+ store i32 %k.0, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %3 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %3, %end
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret i32 undef
+}
+
+
+
+; int func(int *A, int n) {
+; unsigned sum = 0;
+; for (int i = 0; i < n; ++i)
+; if (A[i] > 30)
+; sum += A[i] + 2;
+;
+; return sum;
+; }
+
+;CHECK-LABEL: @reduction_func(
+;CHECK: load <4 x i32>
+;CHECK: icmp slt <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: select <4 x i1>
+;CHECK: ret i32
+define i32 @reduction_func(i32* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ %cmp10 = icmp sgt i32 %n, 0
+ br i1 %cmp10, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+ %sum.011 = phi i32 [ %sum.1, %for.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, 30
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %add = add i32 %sum.011, 2
+ %add4 = add i32 %add, %0
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %sum.1 = phi i32 [ %add4, %if.then ], [ %sum.011, %for.body ]
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %sum.1, %for.inc ]
+ ret i32 %sum.0.lcssa
+}
+
+@a = common global [1 x i32*] zeroinitializer, align 8
+@c = common global i32* null, align 8
+
+; We used to if-convert this loop. This is not safe because there is a
+; trapping constant expression.
+; PR16729
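+;
+; Roughly (an illustrative sketch, not the exact reduced source), the phi
+; input is a constant expression of the form
+;
+;   cond = (&a[0] == c) ? 1 / (&a[0] == c) : 0;
+;
+; which only divides by a non-zero value on the path where it is evaluated.
+; If-converting would evaluate the sdiv unconditionally and divide by zero
+; whenever the pointers differ.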
+
+; CHECK-LABEL: trapping_constant_expression
+; CHECK-NOT: or <4 x i32>
+
+define i32 @trapping_constant_expression() {
+entry:
+ br label %for.body
+
+for.body:
+ %inc3 = phi i32 [ 0, %entry ], [ %inc, %cond.end ]
+ %or2 = phi i32 [ 0, %entry ], [ %or, %cond.end ]
+ br i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 0), i32** @c), label %cond.false, label %cond.end
+
+cond.false:
+ br label %cond.end
+
+cond.end:
+ %cond = phi i32 [ sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 0), i32** @c) to i32)), %cond.false ], [ 0, %for.body ]
+ %or = or i32 %or2, %cond
+ %inc = add nsw i32 %inc3, 1
+ %cmp = icmp slt i32 %inc, 128
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret i32 %or
+}
+
+; Neither should we if-convert if there is an instruction operand that is a
+; trapping constant expression.
+; PR16729
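+;
+; Here the trapping constant expression (a division by a possibly-zero
+; constant) is an operand of the 'or' inside the conditional block rather
+; than a phi input, so speculating it is equally unsafe.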
+
+; CHECK-LABEL: trapping_constant_expression2
+; CHECK-NOT: or <4 x i32>
+
+define i32 @trapping_constant_expression2() {
+entry:
+ br label %for.body
+
+for.body:
+ %inc3 = phi i32 [ 0, %entry ], [ %inc, %cond.end ]
+ %or2 = phi i32 [ 0, %entry ], [ %or, %cond.end ]
+ br i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 0), i32** @c), label %cond.false, label %cond.end
+
+cond.false:
+ %cond.1 = or i32 %inc3, sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32))
+ br label %cond.end
+
+cond.end:
+ %cond = phi i32 [ %cond.1, %cond.false ], [ %inc3, %for.body ]
+ %or = or i32 %or2, %cond
+ %inc = add nsw i32 %inc3, 1
+ %cmp = icmp slt i32 %inc, 128
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret i32 %or
+}
+
+; Handle a PHI with a single incoming value whose block has a full mask.
+; PR34523
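+;
+; %_tmp1 in bb3 is a phi with exactly one incoming value, and bb3 executes
+; unconditionally (i.e. under a full mask), so it should fold to a simple
+; copy; the CHECK below only requires that a vector.body is still built.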
+
+; CHECK-LABEL: PR34523
+; CHECK: vector.body
+
+define void @PR34523() {
+bb1:
+ br label %bb2
+
+bb2: ; preds = %bb4, %bb1
+ %i = phi i16 [ undef, %bb1 ], [ %_tmp2, %bb4 ]
+ br label %bb3
+
+bb3: ; preds = %bb2
+ %_tmp1 = phi [1 x [1 x i32]]* [ undef, %bb2 ]
+ br label %bb4
+
+bb4: ; preds = %bb3
+ %_tmp2 = add i16 %i, 1
+ %_tmp3 = icmp slt i16 %_tmp2, 2
+ br i1 %_tmp3, label %bb2, label %bb5
+
+bb5: ; preds = %bb4
+ unreachable
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
new file mode 100644
index 00000000000..fa821eadb09
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -0,0 +1,277 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s
+; RUN: opt -S -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NO-VF
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test predication of non-void instructions, specifically (i) that these
+; instructions permit vectorization and (ii) that an insertelement and a phi
+; node are created for each predicated value. We check the full 2-element
+; sequence for the first instruction; for the rest, we just make sure they
+; are predicated based on the code generated for the first element.
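+;
+; A rough C equivalent of the loop below (sketch for orientation only):
+;
+;   for (int i = 0; i < 128; i++) {
+;     int sd = asd[i] + 23, sr = asr[i] + 25;
+;     unsigned ud = aud[i] + 24, ur = aur[i] + 26;
+;     if (asd[i] < 100) {
+;       sd /= asd[i];   // the divisions and remainders may trap on a zero
+;       ud /= aud[i];   // (or INT_MIN/-1) divisor, so each must remain in
+;       sr %= asr[i];   // a predicated block
+;       ur %= aur[i];
+;     }
+;     asd[i] = sd; aud[i] = ud; asr[i] = sr; aur[i] = ur;
+;   }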
+define void @test(i32* nocapture %asd, i32* nocapture %aud,
+ i32* nocapture %asr, i32* nocapture %aur) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %if.end
+ ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: %[[SDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK: br i1 %[[SDEE]], label %[[CSD:[a-zA-Z0-9.]+]], label %[[ESD:[a-zA-Z0-9.]+]]
+; CHECK: [[CSD]]:
+; CHECK: %[[SDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[SDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0]], %[[SDA1]]
+; CHECK: %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SD0]], i32 0
+; CHECK: br label %[[ESD]]
+; CHECK: [[ESD]]:
+; CHECK: %[[SDR:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[SD1]], %[[CSD]] ]
+; CHECK: %[[SDEEH:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 1
+; CHECK: br i1 %[[SDEEH]], label %[[CSDH:[a-zA-Z0-9.]+]], label %[[ESDH:[a-zA-Z0-9.]+]]
+; CHECK: [[CSDH]]:
+; CHECK: %[[SDA0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
+; CHECK: %[[SDA1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1
+; CHECK: %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0H]], %[[SDA1H]]
+; CHECK: %[[SD1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> %[[SDR]], i32 %[[SD0H]], i32 1
+; CHECK: br label %[[ESDH]]
+; CHECK: [[ESDH]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ %[[SDR]], %[[ESD]] ], [ %[[SD1H]], %[[CSDH]] ]
+
+; CHECK: %[[UDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK: br i1 %[[UDEE]], label %[[CUD:[a-zA-Z0-9.]+]], label %[[EUD:[a-zA-Z0-9.]+]]
+; CHECK: [[CUD]]:
+; CHECK: %[[UDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[UDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA0]], %[[UDA1]]
+; CHECK: %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UD0]], i32 0
+; CHECK: br label %[[EUD]]
+; CHECK: [[EUD]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UD1]], %[[CUD]] ]
+
+; CHECK: %[[SREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK: br i1 %[[SREE]], label %[[CSR:[a-zA-Z0-9.]+]], label %[[ESR:[a-zA-Z0-9.]+]]
+; CHECK: [[CSR]]:
+; CHECK: %[[SRA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[SRA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA0]], %[[SRA1]]
+; CHECK: %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SR0]], i32 0
+; CHECK: br label %[[ESR]]
+; CHECK: [[ESR]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[SR1]], %[[CSR]] ]
+
+; CHECK: %[[UREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0
+; CHECK: br i1 %[[UREE]], label %[[CUR:[a-zA-Z0-9.]+]], label %[[EUR:[a-zA-Z0-9.]+]]
+; CHECK: [[CUR]]:
+; CHECK: %[[URA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[URA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0
+; CHECK: %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA0]], %[[URA1]]
+; CHECK: %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UR0]], i32 0
+; CHECK: br label %[[EUR]]
+; CHECK: [[EUR]]:
+; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UR1]], %[[CUR]] ]
+
+for.body: ; preds = %if.end, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+ %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+ %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
+ %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
+ %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
+ %lsd = load i32, i32* %isd, align 4
+ %lud = load i32, i32* %iud, align 4
+ %lsr = load i32, i32* %isr, align 4
+ %lur = load i32, i32* %iur, align 4
+ %psd = add nsw i32 %lsd, 23
+ %pud = add nsw i32 %lud, 24
+ %psr = add nsw i32 %lsr, 25
+ %pur = add nsw i32 %lur, 26
+ %cmp1 = icmp slt i32 %lsd, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %rsd = sdiv i32 %psd, %lsd
+ %rud = udiv i32 %pud, %lud
+ %rsr = srem i32 %psr, %lsr
+ %rur = urem i32 %pur, %lur
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+ %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
+ %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
+ %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
+ store i32 %ysd.0, i32* %isd, align 4
+ store i32 %yud.0, i32* %iud, align 4
+ store i32 %ysr.0, i32* %isr, align 4
+ store i32 %yur.0, i32* %iur, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 128
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @test_scalar2scalar(i32* nocapture %asd, i32* nocapture %bsd) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %if.end
+ ret void
+
+; CHECK-LABEL: test_scalar2scalar
+; CHECK: vector.body:
+; CHECK: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
+; CHECK: [[THEN]]:
+; CHECK: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; CHECK: br label %[[FI]]
+; CHECK: [[FI]]:
+; CHECK: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ]
+
+for.body: ; preds = %if.end, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+ %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+ %lsd = load i32, i32* %isd, align 4
+ %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
+ %lsd.b = load i32, i32* %isd.b, align 4
+ %psd = add nsw i32 %lsd, 23
+ %cmp1 = icmp slt i32 %lsd, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %sd1 = sdiv i32 %psd, %lsd
+ %rsd = sdiv i32 %lsd.b, %sd1
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+ store i32 %ysd.0, i32* %isd, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 128
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @pr30172(i32* nocapture %asd, i32* nocapture %bsd) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %if.end
+ ret void
+
+; CHECK-LABEL: pr30172
+; CHECK: vector.body:
+; CHECK: %[[CMP1:.+]] = icmp slt <2 x i32> %[[VAL:.+]], <i32 100, i32 100>
+; CHECK: %[[CMP2:.+]] = icmp sge <2 x i32> %[[VAL]], <i32 200, i32 200>
+; CHECK: %[[NOT:.+]] = xor <2 x i1> %[[CMP1]], <i1 true, i1 true>
+; CHECK: %[[AND:.+]] = and <2 x i1> %[[CMP2]], %[[NOT]]
+; CHECK: %[[OR:.+]] = or <2 x i1> %[[AND]], %[[CMP1]]
+; CHECK: %[[EXTRACT:.+]] = extractelement <2 x i1> %[[OR]], i32 0
+; CHECK: br i1 %[[EXTRACT]], label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]]
+; CHECK: [[THEN]]:
+; CHECK: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}}
+; CHECK: br label %[[FI]]
+; CHECK: [[FI]]:
+; CHECK: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ]
+
+
+for.body: ; preds = %if.end, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+ %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+ %lsd = load i32, i32* %isd, align 4
+ %isd.b = getelementptr inbounds i32, i32* %bsd, i64 %indvars.iv
+ %lsd.b = load i32, i32* %isd.b, align 4
+ %psd = add nsw i32 %lsd, 23
+ %cmp1 = icmp slt i32 %lsd, 100
+ br i1 %cmp1, label %if.then, label %check
+
+check: ; preds = %for.body
+ %cmp2 = icmp sge i32 %lsd, 200
+ br i1 %cmp2, label %if.then, label %if.end
+
+if.then: ; preds = %check, %for.body
+ %sd1 = sdiv i32 %psd, %lsd
+ %rsd = sdiv i32 %lsd.b, %sd1
+ br label %if.end
+
+if.end: ; preds = %if.then, %check
+ %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %check ]
+ store i32 %ysd.0, i32* %isd, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 128
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
+define i32 @predicated_udiv_scalarized_operand(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+; CHECK-LABEL: predicated_udiv_scalarized_operand
+; CHECK: vector.body:
+; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T00:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK: %[[T01:.+]] = add nsw i32 %[[T00]], %x
+; CHECK: %[[T02:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; CHECK: %[[T03:.+]] = udiv i32 %[[T02]], %[[T01]]
+; CHECK: %[[T04:.+]] = insertelement <2 x i32> undef, i32 %[[T03]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T05:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T06:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK: %[[T07:.+]] = add nsw i32 %[[T06]], %x
+; CHECK: %[[T08:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; CHECK: %[[T09:.+]] = udiv i32 %[[T08]], %[[T07]]
+; CHECK: %[[T10:.+]] = insertelement <2 x i32> %[[T05]], i32 %[[T09]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i32> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+; Test predicating an instruction that feeds a vectorizable use, when unrolled
+; but not vectorized. Derived from pr34248 reproducer.
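+; With -force-vector-width=1 and -force-vector-interleave=2, the udiv is
+; cloned once per interleaved copy and each clone is guarded by its own
+; branch, as the UNROLL-NO-VF lines below check.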
+;
+; UNROLL-NO-VF-LABEL: predicated_udiv_scalarized_operand
+; UNROLL-NO-VF: vector.body:
+; UNROLL-NO-VF: %[[LOAD0:.+]] = load i32, i32*
+; UNROLL-NO-VF: %[[LOAD1:.+]] = load i32, i32*
+; UNROLL-NO-VF: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; UNROLL-NO-VF: [[IF0]]:
+; UNROLL-NO-VF: %[[ADD0:.+]] = add nsw i32 %[[LOAD0]], %x
+; UNROLL-NO-VF: %[[DIV0:.+]] = udiv i32 %[[LOAD0]], %[[ADD0]]
+; UNROLL-NO-VF: br label %[[CONT0]]
+; UNROLL-NO-VF: [[CONT0]]:
+; UNROLL-NO-VF: phi i32 [ undef, %vector.body ], [ %[[DIV0]], %[[IF0]] ]
+; UNROLL-NO-VF: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; UNROLL-NO-VF: [[IF1]]:
+; UNROLL-NO-VF: %[[ADD1:.+]] = add nsw i32 %[[LOAD1]], %x
+; UNROLL-NO-VF: %[[DIV1:.+]] = udiv i32 %[[LOAD1]], %[[ADD1]]
+; UNROLL-NO-VF: br label %[[CONT1]]
+; UNROLL-NO-VF: [[CONT1]]:
+; UNROLL-NO-VF: phi i32 [ undef, %[[CONT0]] ], [ %[[DIV1]], %[[IF1]] ]
+; UNROLL-NO-VF: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp2 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp3 = add nsw i32 %tmp2, %x
+ %tmp4 = udiv i32 %tmp2, %tmp3
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i32 [ %tmp2, %for.body ], [ %tmp4, %if.then ]
+ %tmp6 = add i32 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll b/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
new file mode 100644
index 00000000000..b3e2a5425a0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-not-when-safe.ll
@@ -0,0 +1,89 @@
+; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test that provably safe instructions, e.g. divisions by a non-zero
+; constant, are not predicated.
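+;
+; E.g. (sketch): "x / 11" cannot trap for any x, so it may be widened and
+; executed unconditionally for all lanes, whereas "x / 0" must not show up
+; in the widened vector code, as the CHECK-NOT lines verify.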
+define void @test(i32* nocapture %asd, i32* nocapture %aud,
+ i32* nocapture %asr, i32* nocapture %aur,
+ i32* nocapture %asd0, i32* nocapture %aud0,
+ i32* nocapture %asr0, i32* nocapture %aur0
+) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %if.end
+ ret void
+
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 11, i32 11>
+; CHECK: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 13, i32 13>
+; CHECK: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 17, i32 17>
+; CHECK: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 19, i32 19>
+; CHECK-NOT: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 0, i32 0>
+
+for.body: ; preds = %if.end, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
+ %isd = getelementptr inbounds i32, i32* %asd, i64 %indvars.iv
+ %iud = getelementptr inbounds i32, i32* %aud, i64 %indvars.iv
+ %isr = getelementptr inbounds i32, i32* %asr, i64 %indvars.iv
+ %iur = getelementptr inbounds i32, i32* %aur, i64 %indvars.iv
+ %lsd = load i32, i32* %isd, align 4
+ %lud = load i32, i32* %iud, align 4
+ %lsr = load i32, i32* %isr, align 4
+ %lur = load i32, i32* %iur, align 4
+ %psd = add nsw i32 %lsd, 23
+ %pud = add nsw i32 %lud, 24
+ %psr = add nsw i32 %lsr, 25
+ %pur = add nsw i32 %lur, 26
+ %isd0 = getelementptr inbounds i32, i32* %asd0, i64 %indvars.iv
+ %iud0 = getelementptr inbounds i32, i32* %aud0, i64 %indvars.iv
+ %isr0 = getelementptr inbounds i32, i32* %asr0, i64 %indvars.iv
+ %iur0 = getelementptr inbounds i32, i32* %aur0, i64 %indvars.iv
+ %lsd0 = load i32, i32* %isd0, align 4
+ %lud0 = load i32, i32* %iud0, align 4
+ %lsr0 = load i32, i32* %isr0, align 4
+ %lur0 = load i32, i32* %iur0, align 4
+ %psd0 = add nsw i32 %lsd, 27
+ %pud0 = add nsw i32 %lud, 28
+ %psr0 = add nsw i32 %lsr, 29
+ %pur0 = add nsw i32 %lur, 30
+ %cmp1 = icmp slt i32 %lsd, 100
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then: ; preds = %for.body
+ %rsd = sdiv i32 %psd, 11
+ %rud = udiv i32 %pud, 13
+ %rsr = srem i32 %psr, 17
+ %rur = urem i32 %pur, 19
+ %rsd0 = sdiv i32 %psd0, 0
+ %rud0 = udiv i32 %pud0, 0
+ %rsr0 = srem i32 %psr0, 0
+ %rur0 = urem i32 %pur0, 0
+ br label %if.end
+
+if.end: ; preds = %if.then, %for.body
+ %ysd.0 = phi i32 [ %rsd, %if.then ], [ %psd, %for.body ]
+ %yud.0 = phi i32 [ %rud, %if.then ], [ %pud, %for.body ]
+ %ysr.0 = phi i32 [ %rsr, %if.then ], [ %psr, %for.body ]
+ %yur.0 = phi i32 [ %rur, %if.then ], [ %pur, %for.body ]
+ %ysd0.0 = phi i32 [ %rsd0, %if.then ], [ %psd0, %for.body ]
+ %yud0.0 = phi i32 [ %rud0, %if.then ], [ %pud0, %for.body ]
+ %ysr0.0 = phi i32 [ %rsr0, %if.then ], [ %psr0, %for.body ]
+ %yur0.0 = phi i32 [ %rur0, %if.then ], [ %pur0, %for.body ]
+ store i32 %ysd.0, i32* %isd, align 4
+ store i32 %yud.0, i32* %iud, align 4
+ store i32 %ysr.0, i32* %isr, align 4
+ store i32 %yur.0, i32* %iur, align 4
+ store i32 %ysd0.0, i32* %isd0, align 4
+ store i32 %yud0.0, i32* %iud0, align 4
+ store i32 %ysr0.0, i32* %isr0, align 4
+ store i32 %yur0.0, i32* %iur0, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 128
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
new file mode 100644
index 00000000000..61c05d3154c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -0,0 +1,178 @@
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=UNROLL
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -verify-loop-info < %s | FileCheck %s --check-prefix=UNROLL-NOSIMPLIFY
+; RUN: opt -S -vectorize-num-stores-pred=1 -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg < %s | FileCheck %s --check-prefix=VEC
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test predication of stores.
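+;
+; Roughly (sketch): for (i = 0; i < 128; i++) if (f[i] > 100) f[i] += 20;
+; The store may only execute for lanes where the compare is true, so each
+; lane's store is emitted in its own predicated block.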
+define i32 @test(i32* nocapture %f) #0 {
+entry:
+ br label %for.body
+
+; VEC-LABEL: test
+; VEC: %[[v0:.+]] = add i64 %index, 0
+; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
+; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
+; VEC: %[[v11:.+]] = extractelement <2 x i1> %[[v8]], i32 0
+; VEC: br i1 %[[v11]], label %[[cond:.+]], label %[[else:.+]]
+;
+; VEC: [[cond]]:
+; VEC: %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0
+; VEC: %[[v9a:.+]] = add nsw i32 %[[v13]], 20
+; VEC: store i32 %[[v9a]], i32* %[[v2]], align 4
+; VEC: br label %[[else:.+]]
+;
+; VEC: [[else]]:
+; VEC: %[[v15:.+]] = extractelement <2 x i1> %[[v8]], i32 1
+; VEC: br i1 %[[v15]], label %[[cond2:.+]], label %[[else2:.+]]
+;
+; VEC: [[cond2]]:
+; VEC: %[[v17:.+]] = extractelement <2 x i32> %wide.load, i32 1
+; VEC: %[[v9b:.+]] = add nsw i32 %[[v17]], 20
+; VEC: %[[v1:.+]] = add i64 %index, 1
+; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]]
+; VEC: store i32 %[[v9b]], i32* %[[v4]], align 4
+; VEC: br label %[[else2:.+]]
+;
+; VEC: [[else2]]:
+
+; UNROLL-LABEL: test
+; UNROLL: vector.body:
+; UNROLL: %[[IND:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 0
+; UNROLL: %[[IND1:[a-zA-Z0-9]+]] = add i64 %{{.*}}, 1
+; UNROLL: %[[v0:[a-zA-Z0-9]+]] = getelementptr inbounds i32, i32* %f, i64 %[[IND]]
+; UNROLL: %[[v1:[a-zA-Z0-9]+]] = getelementptr inbounds i32, i32* %f, i64 %[[IND1]]
+; UNROLL: %[[v2:[a-zA-Z0-9]+]] = load i32, i32* %[[v0]], align 4
+; UNROLL: %[[v3:[a-zA-Z0-9]+]] = load i32, i32* %[[v1]], align 4
+; UNROLL: %[[v4:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v2]], 100
+; UNROLL: %[[v5:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v3]], 100
+; UNROLL: br i1 %[[v4]], label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]]
+;
+; UNROLL: [[cond]]:
+; UNROLL: %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20
+; UNROLL: store i32 %[[v6]], i32* %[[v0]], align 4
+; UNROLL: br label %[[else]]
+;
+; UNROLL: [[else]]:
+; UNROLL: br i1 %[[v5]], label %[[cond2:[a-zA-Z0-9.]+]], label %[[else2:[a-zA-Z0-9.]+]]
+;
+; UNROLL: [[cond2]]:
+; UNROLL: %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20
+; UNROLL: store i32 %[[v7]], i32* %[[v1]], align 4
+; UNROLL: br label %[[else2]]
+;
+; UNROLL: [[else2]]:
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds i32, i32* %f, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %0, 100
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %add = add nsw i32 %0, 20
+ store i32 %add, i32* %arrayidx, align 4
+ br label %for.inc
+
+for.inc:
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 128
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 0
+}
+
+; Track basic blocks when unrolling conditional blocks. This code used to assert
+; because we did not update the phi nodes with the proper predecessor in the
+; vectorized loop body.
+; PR18724
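+;
+; When the conditional block is unrolled, the phi in for.inc gains one
+; incoming block per copy; the vectorized body must rewire those
+; predecessors, and the two predicated stores checked below confirm that
+; both copies are emitted.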
+
+; UNROLL-NOSIMPLIFY-LABEL: bug18724
+; UNROLL-NOSIMPLIFY: store i32
+; UNROLL-NOSIMPLIFY: store i32
+
+define void @bug18724() {
+entry:
+ br label %for.body9
+
+for.body9:
+ br i1 undef, label %for.inc26, label %for.body14
+
+for.body14:
+ %indvars.iv3 = phi i64 [ %indvars.iv.next4, %for.inc23 ], [ undef, %for.body9 ]
+ %iNewChunks.120 = phi i32 [ %iNewChunks.2, %for.inc23 ], [ undef, %for.body9 ]
+ %arrayidx16 = getelementptr inbounds [768 x i32], [768 x i32]* undef, i64 0, i64 %indvars.iv3
+ %tmp = load i32, i32* %arrayidx16, align 4
+ br i1 undef, label %if.then18, label %for.inc23
+
+if.then18:
+ store i32 2, i32* %arrayidx16, align 4
+ %inc21 = add nsw i32 %iNewChunks.120, 1
+ br label %for.inc23
+
+for.inc23:
+ %iNewChunks.2 = phi i32 [ %inc21, %if.then18 ], [ %iNewChunks.120, %for.body14 ]
+ %indvars.iv.next4 = add nsw i64 %indvars.iv3, 1
+ %tmp1 = trunc i64 %indvars.iv3 to i32
+ %cmp13 = icmp slt i32 %tmp1, 0
+ br i1 %cmp13, label %for.body14, label %for.inc26
+
+for.inc26:
+ %iNewChunks.1.lcssa = phi i32 [ undef, %for.body9 ], [ %iNewChunks.2, %for.inc23 ]
+ unreachable
+}
+
+; VEC-LABEL: @minimal_bit_widths(
+;
+; In the test below, it's more profitable for the expression feeding the
+; conditional store to remain scalar. Since we can only type-shrink vector
+; types, we shouldn't try to represent the expression in a smaller type.
+;
+; VEC: vector.body:
+; VEC: %wide.load = load <2 x i8>, <2 x i8>* {{.*}}, align 1
+; VEC: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; VEC: [[IF0]]:
+; VEC: %[[E0:.+]] = extractelement <2 x i8> %wide.load, i32 0
+; VEC: %[[Z0:.+]] = zext i8 %[[E0]] to i32
+; VEC: %[[T0:.+]] = trunc i32 %[[Z0]] to i8
+; VEC: store i8 %[[T0]], i8* {{.*}}, align 1
+; VEC: br label %[[CONT0]]
+; VEC: [[CONT0]]:
+; VEC: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; VEC: [[IF1]]:
+; VEC: %[[E1:.+]] = extractelement <2 x i8> %wide.load, i32 1
+; VEC: %[[Z1:.+]] = zext i8 %[[E1]] to i32
+; VEC: %[[T1:.+]] = trunc i32 %[[Z1]] to i8
+; VEC: store i8 %[[T1]], i8* {{.*}}, align 1
+; VEC: br label %[[CONT1]]
+; VEC: [[CONT1]]:
+; VEC: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @minimal_bit_widths(i1 %c) {
+entry:
+ br label %for.body
+
+for.body:
+ %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ]
+ %tmp1 = phi i64 [ %tmp7, %for.inc ], [ undef, %entry ]
+ %tmp2 = getelementptr i8, i8* undef, i64 %tmp0
+ %tmp3 = load i8, i8* %tmp2, align 1
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp4 = zext i8 %tmp3 to i32
+ %tmp5 = trunc i32 %tmp4 to i8
+ store i8 %tmp5, i8* %tmp2, align 1
+ br label %for.inc
+
+for.inc:
+ %tmp6 = add nuw nsw i64 %tmp0, 1
+ %tmp7 = add i64 %tmp1, -1
+ %tmp8 = icmp eq i64 %tmp7, 0
+ br i1 %tmp8, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
new file mode 100644
index 00000000000..e3ce3ef7c9b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll
@@ -0,0 +1,821 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; Float pattern:
+; Check vectorization of reduction code which has an fadd instruction after
+; an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fadd_select1(float * restrict x, const int N) {
+;   float sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > (float)0.)
+; sum += x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fadd_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fadd_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body:                                        ; preds = %for.header, %for.body
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+ %add = fadd fast float %0, %sum.1
+ %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret float %1
+}
+
+; Double pattern:
+; Check vectorization of reduction code which has an fadd instruction after
+; an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fadd_select2(double * restrict x, const int N) {
+;   double sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > 0.)
+; sum += x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fadd_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fadd_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body:                                        ; preds = %for.header, %for.body
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 4
+ %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+ %add = fadd fast double %0, %sum.1
+ %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret double %1
+}
+
+; Float pattern:
+; Check vectorization of reduction code which has an fadd instruction after
+; an fcmp instruction which compares an array element and a floating-point
+; value.
+;
+; float fcmp_val_fadd_select1(float * restrict x, float y, const int N) {
+;   float sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > y)
+; sum += x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_val_fadd_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %broadcast.splat2
+; CHECK: %[[V3:.*]] = fadd fast <4 x float> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_val_fadd_select1(float* noalias %x, float %y, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body:                                        ; preds = %for.header, %for.body
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp.2 = fcmp fast ogt float %0, %y
+ %add = fadd fast float %0, %sum.1
+ %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret float %1
+}
+
+; Double pattern:
+; Check vectorization of reduction code which has an fadd instruction after
+; an fcmp instruction which compares an array element and a floating-point
+; value.
+;
+; double fcmp_val_fadd_select2(double * restrict x, double y, const int N) {
+;   double sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > y)
+; sum += x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_val_fadd_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %broadcast.splat2
+; CHECK: %[[V3:.*]] = fadd fast <4 x double> %[[V0]], %[[V2:.*]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_val_fadd_select2(double* noalias %x, double %y, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body:                                        ; preds = %for.header, %for.body
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 4
+ %cmp.2 = fcmp fast ogt double %0, %y
+ %add = fadd fast double %0, %sum.1
+ %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret double %1
+}
+
+; Float pattern:
+; Check vectorization of reduction code which has an fadd instruction after
+; an fcmp instruction which compares an array element and another array
+; element.
+;
+; float fcmp_array_elm_fadd_select1(float * restrict x, float * restrict y,
+; const int N) {
+;   float sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > y[i])
+; sum += x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_array_elm_fadd_select1(
+; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], %[[V1:.*]]
+; CHECK: %[[V4:.*]] = fadd fast <4 x float> %[[V0]], %[[V3:.*]]
+; CHECK: select <4 x i1> %[[V2]], <4 x float> %[[V4]], <4 x float> %[[V3]]
+define float @fcmp_array_elm_fadd_select1(float* noalias %x, float* noalias %y, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx.1 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx.1, align 4
+ %arrayidx.2 = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %1 = load float, float* %arrayidx.2, align 4
+ %cmp.2 = fcmp fast ogt float %0, %1
+ %add = fadd fast float %0, %sum.1
+ %sum.2 = select i1 %cmp.2, float %add, float %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %2 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret float %2
+}
+
+; Double pattern:
+; Check vectorization of reduction code which has an fadd instruction after
+; an fcmp instruction which compares an array element and another array
+; element.
+;
+; double fcmp_array_elm_fadd_select2(double * restrict x, double * restrict y,
+; const int N) {
+;   double sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > y[i])
+; sum += x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_array_elm_fadd_select2(
+; CHECK: %[[V2:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], %[[V1:.*]]
+; CHECK: %[[V4:.*]] = fadd fast <4 x double> %[[V0]], %[[V3:.*]]
+; CHECK: select <4 x i1> %[[V2]], <4 x double> %[[V4]], <4 x double> %[[V3]]
+define double @fcmp_array_elm_fadd_select2(double* noalias %x, double* noalias %y, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx.1 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ %0 = load double, double* %arrayidx.1, align 4
+ %arrayidx.2 = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %1 = load double, double* %arrayidx.2, align 4
+ %cmp.2 = fcmp fast ogt double %0, %1
+ %add = fadd fast double %0, %sum.1
+ %sum.2 = select i1 %cmp.2, double %add, double %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %2 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret double %2
+}
+
+; Float pattern:
+; Check vectorization of reduction code which has an fsub instruction after
+; an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fsub_select1(float * restrict x, const int N) {
+;   float sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > (float)0.)
+; sum -= x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fsub fast <4 x float> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fsub_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+ %sub = fsub fast float %sum.1, %0
+ %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret float %1
+}
+
+; Float pattern:
+; Check that the reduction is not vectorized if the fp instructions have no
+; fast-math flags.
+; float fcmp_0_fsub_select1_novectorize(float * restrict x, const int N) {
+;   float sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > (float)0.)
+; sum -= x[i];
+; return sum;
+; }
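+;
+; Vectorizing this reduction would reassociate the fsub operations across
+; iterations, which can change rounding, so the vectorizer requires
+; fast-math flags on them (cf. "fsub fast" in the vectorized variant above).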
+
+; CHECK-LABEL: @fcmp_0_fsub_select1_novectorize(
+; CHECK-NOT: <4 x float>
+define float @fcmp_0_fsub_select1_novectorize(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp.2 = fcmp ogt float %0, 0.000000e+00
+ %sub = fsub float %sum.1, %0
+ %sum.2 = select i1 %cmp.2, float %sub, float %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret float %1
+}
+
+; Double pattern:
+; Check vectorization of reduction code which has an fsub instruction after
+; an fcmp instruction which compares an array element and 0.
+;
+; double fcmp_0_fsub_select2(double * restrict x, const int N) {
+;   double sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > 0.)
+; sum -= x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fsub fast <4 x double> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fsub_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 4
+ %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+ %sub = fsub fast double %sum.1, %0
+ %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret double %1
+}
+
+; Double pattern:
+; Check that the reduction is not vectorized if the fp instructions have no
+; fast-math flags.
+;
+; double fcmp_0_fsub_select2_notvectorize(double * restrict x, const int N) {
+;   double sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > 0.)
+; sum -= x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fsub_select2_notvectorize(
+; CHECK-NOT: <4 x double>
+define double @fcmp_0_fsub_select2_notvectorize(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 4
+ %cmp.2 = fcmp ogt double %0, 0.000000e+00
+ %sub = fsub double %sum.1, %0
+ %sum.2 = select i1 %cmp.2, double %sub, double %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret double %1
+}
+
+; Float pattern:
+; Check vectorization of reduction code which has an fmul instruction after
+; an fcmp instruction which compares an array element and 0.
+;
+; float fcmp_0_fmult_select1(float * restrict x, const int N) {
+;   float sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > (float)0.)
+; sum *= x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select1(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x float> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fmul fast <4 x float> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x float> %[[V3]], <4 x float> %[[V2]]
+define float @fcmp_0_fmult_select1(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp.2 = fcmp fast ogt float %0, 0.000000e+00
+ %mult = fmul fast float %sum.1, %0
+ %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret float %1
+}
+
+; Float pattern:
+; Check that the loop is not vectorized if the fp instruction has no fast-math flags.
+;
+; float fcmp_0_fmult_select1_notvectorize(float * restrict x, const int N) {
+;   float sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > (float)0.)
+; sum *= x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select1_notvectorize(
+; CHECK-NOT: <4 x float>
+define float @fcmp_0_fmult_select1_notvectorize(float* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi float [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp.2 = fcmp ogt float %0, 0.000000e+00
+ %mult = fmul float %sum.1, %0
+ %sum.2 = select i1 %cmp.2, float %mult, float %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi float [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret float %1
+}
+
+; Double pattern:
+; Check vectorization of reduction code which has an fmul instruction after
+; an fcmp instruction which compares an array element with 0.
+;
+; double fcmp_0_fmult_select2(double * restrict x, const int N) {
+;   double sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > 0.)
+; sum *= x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select2(
+; CHECK: %[[V1:.*]] = fcmp fast ogt <4 x double> %[[V0:.*]], zeroinitializer
+; CHECK: %[[V3:.*]] = fmul fast <4 x double> %[[V2:.*]], %[[V0]]
+; CHECK: select <4 x i1> %[[V1]], <4 x double> %[[V3]], <4 x double> %[[V2]]
+define double @fcmp_0_fmult_select2(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 4
+ %cmp.2 = fcmp fast ogt double %0, 0.000000e+00
+ %mult = fmul fast double %sum.1, %0
+ %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret double %1
+}
+
+; Double pattern:
+; Check that the loop is not vectorized if the fp instruction has no fast-math flags.
+;
+; double fcmp_0_fmult_select2_notvectorize(double * restrict x, const int N) {
+;   double sum = 0.;
+; for (int i = 0; i < N; ++i)
+; if (x[i] > 0.)
+; sum *= x[i];
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_0_fmult_select2_notvectorize(
+; CHECK-NOT: <4 x double>
+define double @fcmp_0_fmult_select2_notvectorize(double* noalias %x, i32 %N) nounwind readonly {
+entry:
+ %cmp.1 = icmp sgt i32 %N, 0
+ br i1 %cmp.1, label %for.header, label %for.end
+
+for.header: ; preds = %entry
+ %zext = zext i32 %N to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.header
+ %indvars.iv = phi i64 [ 0, %for.header ], [ %indvars.iv.next, %for.body ]
+ %sum.1 = phi double [ 0.000000e+00, %for.header ], [ %sum.2, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 4
+ %cmp.2 = fcmp ogt double %0, 0.000000e+00
+ %mult = fmul double %sum.1, %0
+ %sum.2 = select i1 %cmp.2, double %mult, double %sum.1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %zext
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %1 = phi double [ 0.000000e+00, %entry ], [ %sum.2, %for.body ]
+ ret double %1
+}
+
+; Float multi pattern:
+; Check vectorisation of reduction code with a pair of selects choosing
+; between different fadd operands.
+;
+; float fcmp_multi(float *a, int n) {
+; float sum=0.0;
+; for (int i=0;i<n;i++) {
+; if (a[i]>1.0)
+; sum+=a[i];
+; else if (a[i]<3.0)
+; sum+=2*a[i];
+; else
+; sum+=3*a[i];
+; }
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_multi(
+; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
+; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[M1:.*]] = fmul fast <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[M2:.*]] = fmul fast <4 x float> %[[V0]], <float 2.000000e+00,
+; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
+; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
+; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[S1:.*]] = select <4 x i1> %[[C22]], <4 x float> %[[M1]], <4 x float> %[[M2]]
+; CHECK: %[[S2:.*]] = select <4 x i1> %[[C1]], <4 x float> %[[V0]], <4 x float> %[[S1]]
+; CHECK: fadd fast <4 x float> %[[S2]],
+define float @fcmp_multi(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+ %cmp10 = icmp sgt i32 %n, 0
+ br i1 %cmp10, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body: ; preds = %for.inc, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %sum.011 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 1.000000e+00
+ br i1 %cmp1, label %for.inc, label %if.else
+
+if.else: ; preds = %for.body
+ %cmp8 = fcmp olt float %0, 3.000000e+00
+ br i1 %cmp8, label %if.then10, label %if.else14
+
+if.then10: ; preds = %if.else
+ %mul = fmul fast float %0, 2.000000e+00
+ br label %for.inc
+
+if.else14: ; preds = %if.else
+ %mul17 = fmul fast float %0, 3.000000e+00
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.else14, %if.then10
+ %.pn = phi float [ %mul, %if.then10 ], [ %mul17, %if.else14 ], [ %0, %for.body ]
+ %sum.1 = fadd fast float %.pn, %sum.011
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+ ret float %sum.0.lcssa
+}
+
+; Float fadd + fsub patterns:
+; Check vectorisation of reduction code with a pair of selects feeding two
+; different but compatible instructions { fadd, fsub } (an fsub is an fadd
+; with the sign of the operand changed).
+;
+; float fcmp_fadd_fsub(float *a, int n) {
+; float sum=0.0;
+; for (int i=0;i<n;i++) {
+; if (a[i]>1.0)
+; sum+=a[i];
+; else if (a[i]<3.0)
+; sum-=a[i];
+; }
+; return sum;
+; }
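+;
+; Compatibility sketch: sum -= a[i] is sum += -a[i], so both arms perform an
+; additive reduction and a single vector accumulator suffices.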
+
+; CHECK-LABEL: @fcmp_fadd_fsub(
+; CHECK: %[[C1:.*]] = fcmp ogt <4 x float> %[[V0:.*]], <float 1.000000e+00,
+; CHECK: %[[C2:.*]] = fcmp olt <4 x float> %[[V0]], <float 3.000000e+00,
+; CHECK-DAG: %[[SUB:.*]] = fsub fast <4 x float>
+; CHECK-DAG: %[[ADD:.*]] = fadd fast <4 x float>
+; CHECK: %[[C11:.*]] = xor <4 x i1> %[[C1]], <i1 true,
+; CHECK-DAG: %[[C12:.*]] = and <4 x i1> %[[C2]], %[[C11]]
+; CHECK-DAG: %[[C21:.*]] = xor <4 x i1> %[[C2]], <i1 true,
+; CHECK: %[[C22:.*]] = and <4 x i1> %[[C21]], %[[C11]]
+; CHECK: %[[S1:.*]] = select <4 x i1> %[[C12]], <4 x float> %[[SUB]], <4 x float> %[[ADD]]
+; CHECK: %[[S2:.*]] = select <4 x i1> %[[C22]], {{.*}} <4 x float> %[[S1]]
+define float @fcmp_fadd_fsub(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body: ; preds = %for.inc, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 1.000000e+00
+ br i1 %cmp1, label %if.then, label %if.else
+
+if.then: ; preds = %for.body
+ %add = fadd fast float %0, %sum.010
+ br label %for.inc
+
+if.else: ; preds = %for.body
+ %cmp8 = fcmp olt float %0, 3.000000e+00
+ br i1 %cmp8, label %if.then10, label %for.inc
+
+if.then10: ; preds = %if.else
+ %sub = fsub fast float %sum.010, %0
+ br label %for.inc
+
+for.inc: ; preds = %if.then, %if.then10, %if.else
+ %sum.1 = phi float [ %add, %if.then ], [ %sub, %if.then10 ], [ %sum.010, %if.else ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+ ret float %sum.0.lcssa
+}
+
+; Float fadd + fmul patterns:
+; Check lack of vectorisation of reduction code with a pair of incompatible
+; instructions { fadd, fmul }.
+;
+; float fcmp_fadd_fmul(float *a, int n) {
+; float sum=0.0;
+; for (int i=0;i<n;i++) {
+; if (a[i]>1.0)
+; sum+=a[i];
+; else if (a[i]<3.0)
+; sum*=a[i];
+; }
+; return sum;
+; }
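+;
+; Sketch of the reasoning: a running sum and a running product cannot share
+; one vector accumulator, so no single reduction kind matches and the loop is
+; left scalar.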
+
+; CHECK-LABEL: @fcmp_fadd_fmul(
+; CHECK-NOT: <4 x float>
+define float @fcmp_fadd_fmul(float* nocapture readonly %a, i32 %n) nounwind readonly {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body: ; preds = %for.inc, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ]
+ %sum.010 = phi float [ 0.000000e+00, %for.body.preheader ], [ %sum.1, %for.inc ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp1 = fcmp ogt float %0, 1.000000e+00
+ br i1 %cmp1, label %if.then, label %if.else
+
+if.then: ; preds = %for.body
+ %add = fadd fast float %0, %sum.010
+ br label %for.inc
+
+if.else: ; preds = %for.body
+ %cmp8 = fcmp olt float %0, 3.000000e+00
+ br i1 %cmp8, label %if.then10, label %for.inc
+
+if.then10: ; preds = %if.else
+ %mul = fmul fast float %0, %sum.010
+ br label %for.inc
+
+for.inc: ; preds = %if.then, %if.then10, %if.else
+ %sum.1 = phi float [ %add, %if.then ], [ %mul, %if.then10 ], [ %sum.010, %if.else ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc, %entry
+ %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %sum.1, %for.inc ]
+ ret float %sum.0.lcssa
+}
+
+; Float fadd + store patterns:
+; Check lack of vectorisation of reduction code with a store back to the
+; array, given the loop-carried dependence on a[i].
+;
+; float fcmp_store_back(float a[], int LEN) {
+; float sum = 0.0;
+; for (int i = 0; i < LEN; i++) {
+; sum += a[i];
+; a[i] = sum;
+; }
+; return sum;
+; }
+
+; CHECK-LABEL: @fcmp_store_back(
+; CHECK-NOT: <4 x float>
+define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly {
+entry:
+ %cmp7 = icmp sgt i32 %LEN, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext i32 %LEN to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %sum.08 = phi float [ 0.000000e+00, %for.body.preheader ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %add = fadd fast float %0, %sum.08
+ store float %add, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ ret float %sum.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
new file mode 100644
index 00000000000..798793a641b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/incorrect-dom-info.ll
@@ -0,0 +1,142 @@
+; This test is based on one of the benchmarks from SPEC2006. It exposes a bug
+; with incorrect updating of the dominator tree.
+; RUN: opt < %s -loop-vectorize -verify-dom-info
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@PL_utf8skip = external constant [0 x i8]
+
+; Function Attrs: nounwind ssp uwtable
+define void @Perl_pp_quotemeta() #0 {
+ %len = alloca i64, align 8
+ br i1 undef, label %2, label %1
+
+; <label>:1 ; preds = %0
+ br label %3
+
+; <label>:2 ; preds = %0
+ br label %3
+
+; <label>:3 ; preds = %2, %1
+ br i1 undef, label %34, label %4
+
+; <label>:4 ; preds = %3
+ br i1 undef, label %5, label %6
+
+; <label>:5 ; preds = %4
+ br label %6
+
+; <label>:6 ; preds = %5, %4
+ br i1 undef, label %7, label %8
+
+; <label>:7 ; preds = %6
+ br label %8
+
+; <label>:8 ; preds = %7, %6
+ br i1 undef, label %.preheader, label %9
+
+.preheader: ; preds = %9, %8
+ br i1 undef, label %.loopexit, label %.lr.ph
+
+; <label>:9 ; preds = %8
+ br i1 undef, label %thread-pre-split.preheader, label %.preheader
+
+thread-pre-split.preheader: ; preds = %9
+ br i1 undef, label %thread-pre-split._crit_edge, label %.lr.ph21
+
+.thread-pre-split.loopexit_crit_edge: ; preds = %19
+ %scevgep.sum = xor i64 %umax, -1
+ %scevgep45 = getelementptr i8, i8* %d.020, i64 %scevgep.sum
+ br label %thread-pre-split.loopexit
+
+thread-pre-split.loopexit: ; preds = %11, %.thread-pre-split.loopexit_crit_edge
+ %d.1.lcssa = phi i8* [ %scevgep45, %.thread-pre-split.loopexit_crit_edge ], [ %d.020, %11 ]
+ br i1 false, label %thread-pre-split._crit_edge, label %.lr.ph21
+
+.lr.ph21: ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader
+ %d.020 = phi i8* [ undef, %26 ], [ %d.1.lcssa, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
+ %10 = phi i64 [ %28, %26 ], [ undef, %thread-pre-split.loopexit ], [ undef, %thread-pre-split.preheader ]
+ br i1 undef, label %11, label %22
+
+; <label>:11 ; preds = %.lr.ph21
+ %12 = getelementptr inbounds [0 x i8], [0 x i8]* @PL_utf8skip, i64 0, i64 undef
+ %13 = load i8, i8* %12, align 1
+ %14 = zext i8 %13 to i64
+ %15 = icmp ugt i64 %14, %10
+ %. = select i1 %15, i64 %10, i64 %14
+ br i1 undef, label %thread-pre-split.loopexit, label %.lr.ph28
+
+.lr.ph28: ; preds = %11
+ %16 = xor i64 %10, -1
+ %17 = xor i64 %14, -1
+ %18 = icmp ugt i64 %16, %17
+ %umax = select i1 %18, i64 %16, i64 %17
+ br label %19
+
+; <label>:19 ; preds = %19, %.lr.ph28
+ %ulen.126 = phi i64 [ %., %.lr.ph28 ], [ %20, %19 ]
+ %20 = add i64 %ulen.126, -1
+ %21 = icmp eq i64 %20, 0
+ br i1 %21, label %.thread-pre-split.loopexit_crit_edge, label %19
+
+; <label>:22 ; preds = %.lr.ph21
+ br i1 undef, label %26, label %23
+
+; <label>:23 ; preds = %22
+ br i1 undef, label %26, label %24
+
+; <label>:24 ; preds = %23
+ br i1 undef, label %26, label %25
+
+; <label>:25 ; preds = %24
+ br label %26
+
+; <label>:26 ; preds = %25, %24, %23, %22
+ %27 = load i64, i64* %len, align 8
+ %28 = add i64 %27, -1
+ br i1 undef, label %thread-pre-split._crit_edge, label %.lr.ph21
+
+thread-pre-split._crit_edge: ; preds = %26, %thread-pre-split.loopexit, %thread-pre-split.preheader
+ br label %.loopexit
+
+.lr.ph: ; preds = %33, %.preheader
+ br i1 undef, label %29, label %thread-pre-split5
+
+; <label>:29 ; preds = %.lr.ph
+ br i1 undef, label %33, label %30
+
+; <label>:30 ; preds = %29
+ br i1 undef, label %33, label %31
+
+thread-pre-split5: ; preds = %.lr.ph
+ br i1 undef, label %33, label %31
+
+; <label>:31 ; preds = %thread-pre-split5, %30
+ br i1 undef, label %33, label %32
+
+; <label>:32 ; preds = %31
+ br label %33
+
+; <label>:33 ; preds = %32, %31, %thread-pre-split5, %30, %29
+ br i1 undef, label %.loopexit, label %.lr.ph
+
+.loopexit: ; preds = %33, %thread-pre-split._crit_edge, %.preheader
+ br label %35
+
+; <label>:34 ; preds = %3
+ br label %35
+
+; <label>:35 ; preds = %34, %.loopexit
+ br i1 undef, label %37, label %36
+
+; <label>:36 ; preds = %35
+ br label %37
+
+; <label>:37 ; preds = %36, %35
+ ret void
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.6.0 "}
diff --git a/llvm/test/Transforms/LoopVectorize/increment.ll b/llvm/test/Transforms/LoopVectorize/increment.ll
new file mode 100644
index 00000000000..46623888367
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/increment.ll
@@ -0,0 +1,65 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+; for (i=0; i<n; i++){
+; a[i] += i;
+; }
+;CHECK-LABEL: @inc(
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = trunc i64 %indvars.iv to i32
+ %5 = add nsw i32 %3, %4
+ store i32 %5, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+; Can't vectorize this loop because the access to A[B[i]] is non-linear.
+;
+;   for (i = 0; i < n; ++i) {
+;     A[B[i]]++;
+;   }
+;
+;CHECK-LABEL: @histogram(
+;CHECK-NOT: <4 x i32>
+;CHECK: ret i32
+define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %idxprom1 = sext i32 %0 to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1
+ %1 = load i32, i32* %arrayidx2, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll
new file mode 100644
index 00000000000..669746aa196
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll
@@ -0,0 +1,201 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=8 -S | FileCheck %s
+
+; int int_inc;
+;
+;int induction_with_global(int init, int *restrict A, int N) {
+; int x = init;
+; for (int i=0;i<N;i++){
+; A[i] = x;
+; x += int_inc;
+; }
+; return x;
+;}
+
+; CHECK-LABEL: @induction_with_global(
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @int_inc, align 4
+; CHECK: vector.ph:
+; CHECK: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
+; CHECK-NEXT: [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP0]], 8
+; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT: %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK: [[TMP8:%.*]] = add i64 %index, 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> %vec.ind, <8 x i32>* [[TMP11]], align 4
+; CHECK: %index.next = add i64 %index, 8
+; CHECK-NEXT: %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+
+@int_inc = common global i32 0, align 4
+
+define i32 @induction_with_global(i32 %init, i32* noalias nocapture %A, i32 %N) {
+entry:
+ %cmp4 = icmp sgt i32 %N, 0
+ br i1 %cmp4, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %0 = load i32, i32* @int_inc, align 4
+ %1 = mul i32 %0, %N
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %x.05 = phi i32 [ %init, %for.body.lr.ph ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %x.05, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %x.05
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ %2 = add i32 %1, %init
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %x.0.lcssa = phi i32 [ %init, %entry ], [ %2, %for.end.loopexit ]
+ ret i32 %x.0.lcssa
+}
+
+
+;int induction_with_loop_inv(int init, int *restrict A, int N, int M) {
+; int x = init;
+; for (int j = 0; j < M; j++) {
+; for (int i=0; i<N; i++){
+; A[i] = x;
+;      x += j;  // induction step is a loop-invariant variable
+; }
+; }
+; return x;
+;}
+
+; CHECK-LABEL: @induction_with_loop_inv(
+; CHECK: vector.ph:
+; CHECK: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0
+; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
+; CHECK-NEXT: [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i32 %j.012, 8
+; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT: %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK: [[TMP6:%.*]] = add i64 %index, 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> %vec.ind, <8 x i32>* [[TMP9]], align 4
+; CHECK: %index.next = add i64 %index, 8
+; CHECK-NEXT: %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i32 @induction_with_loop_inv(i32 %init, i32* noalias nocapture %A, i32 %N, i32 %M) {
+entry:
+ %cmp10 = icmp sgt i32 %M, 0
+ br i1 %cmp10, label %for.cond1.preheader.lr.ph, label %for.end6
+
+for.cond1.preheader.lr.ph: ; preds = %entry
+ %cmp27 = icmp sgt i32 %N, 0
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc4, %for.cond1.preheader.lr.ph
+ %indvars.iv15 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next16, %for.inc4 ]
+ %j.012 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %inc5, %for.inc4 ]
+ %x.011 = phi i32 [ %init, %for.cond1.preheader.lr.ph ], [ %x.1.lcssa, %for.inc4 ]
+ br i1 %cmp27, label %for.body3.preheader, label %for.inc4
+
+for.body3.preheader: ; preds = %for.cond1.preheader
+ br label %for.body3
+
+for.body3: ; preds = %for.body3.preheader, %for.body3
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body3 ], [ 0, %for.body3.preheader ]
+ %x.18 = phi i32 [ %add, %for.body3 ], [ %x.011, %for.body3.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %x.18, i32* %arrayidx, align 4
+ %add = add nsw i32 %x.18, %j.012
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.inc4.loopexit, label %for.body3
+
+for.inc4.loopexit: ; preds = %for.body3
+ %0 = add i32 %x.011, %indvars.iv15
+ br label %for.inc4
+
+for.inc4: ; preds = %for.inc4.loopexit, %for.cond1.preheader
+ %x.1.lcssa = phi i32 [ %x.011, %for.cond1.preheader ], [ %0, %for.inc4.loopexit ]
+ %inc5 = add nuw nsw i32 %j.012, 1
+ %indvars.iv.next16 = add i32 %indvars.iv15, %N
+ %exitcond17 = icmp eq i32 %inc5, %M
+ br i1 %exitcond17, label %for.end6.loopexit, label %for.cond1.preheader
+
+for.end6.loopexit: ; preds = %for.inc4
+ %x.1.lcssa.lcssa = phi i32 [ %x.1.lcssa, %for.inc4 ]
+ br label %for.end6
+
+for.end6: ; preds = %for.end6.loopexit, %entry
+ %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ]
+ ret i32 %x.0.lcssa
+}
+
+
+; CHECK-LABEL: @non_primary_iv_loop_inv_trunc(
+; CHECK: vector.ph:
+; CHECK: [[TMP3:%.*]] = trunc i64 %step to i32
+; CHECK-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]]
+; CHECK-NEXT: [[INDUCTION7:%.*]] = add <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP3]], 8
+; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
+; CHECK: [[TMP6:%.*]] = add i64 %index, 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[VEC_IND10]], <8 x i32>* [[TMP9]], align 4
+; CHECK-NEXT: %index.next = add i64 %index, 8
+; CHECK: [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
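+;
+; A rough C equivalent (an illustrative sketch, not from the original source):
+;
+;   long long i = 0, j = 0;
+;   do {
+;     a[i] = (int)j;
+;     j += step;
+;   } while (++i < n);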
+
+define void @non_primary_iv_loop_inv_trunc(i32* %a, i64 %n, i64 %step) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = trunc i64 %j to i32
+ store i32 %tmp1, i32* %tmp0, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %j.next = add nuw nsw i64 %j, %step
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
new file mode 100644
index 00000000000..6bcf03f1b6d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -0,0 +1,896 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=IND
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaved-mem-accesses -instcombine -S | FileCheck %s --check-prefix=INTERLEAVE
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure that we can handle multiple integer induction variables.
+;
+; CHECK-LABEL: @multi_int_induction(
+; CHECK: vector.body:
+; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT: %vec.ind = phi <2 x i32> [ <i32 190, i32 191>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK: [[TMP3:%.*]] = add i64 %index, 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> %vec.ind, <2 x i32>* [[TMP6]], align 4
+; CHECK: %index.next = add i64 %index, 2
+; CHECK-NEXT: %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
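+;
+; A rough C equivalent of the loop below (an illustrative sketch, not from
+; the original source):
+;
+;   long long i = 0;
+;   int count = 190;
+;   do {
+;     A[i] = count++;
+;   } while ((int)++i != N);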
+define void @multi_int_induction(i32* %A, i32 %N) {
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %count.09 = phi i32 [ 190, %for.body.lr.ph ], [ %inc, %for.body ]
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %count.09, i32* %arrayidx2, align 4
+ %inc = add nsw i32 %count.09, 1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Make sure we remove unneeded vectorization of induction variables.
+; In order for instcombine to clean up the vectorized induction variables that
+; we create in the loop vectorizer, we need to perform some form of redundancy
+; elimination to get rid of multiple uses.
+
+; IND-LABEL: scalar_use
+
+; IND: br label %vector.body
+; IND: vector.body:
+; Vectorized induction variable.
+; IND-NOT: insertelement <2 x i64>
+; IND-NOT: shufflevector <2 x i64>
+; IND: br {{.*}}, label %vector.body
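+;
+; A rough C equivalent (an illustrative sketch, not from the original source):
+;
+;   long long i = 0;
+;   do {
+;     a[i + offset] += b * a[i + offset2];
+;   } while (++i != n);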
+
+define void @scalar_use(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %ind.sum = add i64 %iv, %offset
+ %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum
+ %l1 = load float, float* %arr.idx, align 4
+ %ind.sum2 = add i64 %iv, %offset2
+ %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2
+ %l2 = load float, float* %arr.idx2, align 4
+ %m = fmul fast float %b, %l2
+ %ad = fadd fast float %l1, %m
+ store float %ad, float* %arr.idx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+ ret void
+}
+
+; Make sure we don't create a vector induction phi node that is unused.
+; Scalarize the step vectors instead.
+;
+; for (int i = 0; i < n; ++i)
+; sum += a[i];
+;
+; CHECK-LABEL: @scalarize_induction_variable_01(
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[i0:.+]] = add i64 %index, 0
+; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
+;
+; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_01(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NO-IC: %[[i0:.+]] = add i64 %index, 0
+; UNROLL-NO-IC: %[[i2:.+]] = add i64 %index, 2
+; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i0]]
+; UNROLL-NO-IC: getelementptr inbounds i64, i64* %a, i64 %[[i2]]
+;
+; IND-LABEL: @scalarize_induction_variable_01(
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND-NOT: add i64 {{.*}}, 2
+; IND: getelementptr inbounds i64, i64* %a, i64 %index
+;
+; UNROLL-LABEL: @scalarize_induction_variable_01(
+; UNROLL: vector.body:
+; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NOT: add i64 {{.*}}, 4
+; UNROLL: %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index
+; UNROLL: getelementptr inbounds i64, i64* %[[g1]], i64 2
+
+define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %sum = phi i64 [ %2, %for.body ], [ 0, %entry ]
+ %0 = getelementptr inbounds i64, i64* %a, i64 %i
+ %1 = load i64, i64* %0, align 8
+ %2 = add i64 %1, %sum
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i64 [ %2, %for.body ]
+ ret i64 %3
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors.
+;
+; float s = 0;
+; for (int i = 0; i < n; i += 8)
+; s += (a[i] + b[i] + 1.0f);
+;
+; CHECK-LABEL: @scalarize_induction_variable_02(
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = mul i64 %index, 8
+; CHECK: %[[i0:.+]] = add i64 %offset.idx, 0
+; CHECK: %[[i1:.+]] = add i64 %offset.idx, 8
+; CHECK: getelementptr inbounds float, float* %a, i64 %[[i0]]
+; CHECK: getelementptr inbounds float, float* %a, i64 %[[i1]]
+; CHECK: getelementptr inbounds float, float* %b, i64 %[[i0]]
+; CHECK: getelementptr inbounds float, float* %b, i64 %[[i1]]
+;
+; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_02(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL-NO-IC: %offset.idx = mul i64 %index, 8
+; UNROLL-NO-IC: %[[i0:.+]] = add i64 %offset.idx, 0
+; UNROLL-NO-IC: %[[i1:.+]] = add i64 %offset.idx, 8
+; UNROLL-NO-IC: %[[i2:.+]] = add i64 %offset.idx, 16
+; UNROLL-NO-IC: %[[i3:.+]] = add i64 %offset.idx, 24
+; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i0]]
+; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i1]]
+; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i2]]
+; UNROLL-NO-IC: getelementptr inbounds float, float* %a, i64 %[[i3]]
+; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i0]]
+; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i1]]
+; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i2]]
+; UNROLL-NO-IC: getelementptr inbounds float, float* %b, i64 %[[i3]]
+;
+; IND-LABEL: @scalarize_induction_variable_02(
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %[[i0:.+]] = shl i64 %index, 3
+; IND: %[[i1:.+]] = or i64 %[[i0]], 8
+; IND: getelementptr inbounds float, float* %a, i64 %[[i0]]
+; IND: getelementptr inbounds float, float* %a, i64 %[[i1]]
+;
+; UNROLL-LABEL: @scalarize_induction_variable_02(
+; UNROLL: vector.body:
+; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %[[i0:.+]] = shl i64 %index, 3
+; UNROLL: %[[i1:.+]] = or i64 %[[i0]], 8
+; UNROLL: %[[i2:.+]] = or i64 %[[i0]], 16
+; UNROLL: %[[i3:.+]] = or i64 %[[i0]], 24
+; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i0]]
+; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i1]]
+; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i2]]
+; UNROLL: getelementptr inbounds float, float* %a, i64 %[[i3]]
+
+define float @scalarize_induction_variable_02(float* %a, float* %b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+ %s = phi float [ 0.0, %entry ], [ %6, %for.body ]
+ %0 = getelementptr inbounds float, float* %a, i64 %i
+ %1 = load float, float* %0, align 4
+ %2 = getelementptr inbounds float, float* %b, i64 %i
+ %3 = load float, float* %2, align 4
+ %4 = fadd fast float %s, 1.0
+ %5 = fadd fast float %4, %1
+ %6 = fadd fast float %5, %3
+ %i.next = add nuw nsw i64 %i, 8
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %s.lcssa = phi float [ %6, %for.body ]
+ ret float %s.lcssa
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors. (Interleaved accesses.)
+;
+; for (int i = 0; i < n; ++i)
+; a[i].f ^= y;
+;
+; INTERLEAVE-LABEL: @scalarize_induction_variable_03(
+; INTERLEAVE: vector.body:
+; INTERLEAVE: %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTERLEAVE: %[[i1:.+]] = or i64 %[[i0]], 1
+; INTERLEAVE: %[[i2:.+]] = or i64 %[[i0]], 2
+; INTERLEAVE: %[[i3:.+]] = or i64 %[[i0]], 3
+; INTERLEAVE: %[[i4:.+]] = or i64 %[[i0]], 4
+; INTERLEAVE: %[[i5:.+]] = or i64 %[[i0]], 5
+; INTERLEAVE: %[[i6:.+]] = or i64 %[[i0]], 6
+; INTERLEAVE: %[[i7:.+]] = or i64 %[[i0]], 7
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i0]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i1]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i2]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i3]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i4]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i5]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i6]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i7]], i32 1
+
+%pair.i32 = type { i32, i32 }
+define void @scalarize_induction_variable_03(%pair.i32 *%p, i32 %y, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %f = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %0 = load i32, i32* %f, align 8
+ %1 = xor i32 %0, %y
+ store i32 %1, i32* %f, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Make sure we scalarize the step vectors used for the pointer arithmetic. We
+; can't easily simplify vectorized step vectors. (Interleaved accesses.)
+;
+; for (int i = 0; i < n; ++i)
+;   p[i].f = a[i * 4];
+;
+; INTERLEAVE-LABEL: @scalarize_induction_variable_04(
+; INTERLEAVE: vector.body:
+; INTERLEAVE: %[[i0:.+]] = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; INTERLEAVE: %[[i1:.+]] = or i64 %[[i0]], 1
+; INTERLEAVE: %[[i2:.+]] = or i64 %[[i0]], 2
+; INTERLEAVE: %[[i3:.+]] = or i64 %[[i0]], 3
+; INTERLEAVE: %[[i4:.+]] = or i64 %[[i0]], 4
+; INTERLEAVE: %[[i5:.+]] = or i64 %[[i0]], 5
+; INTERLEAVE: %[[i6:.+]] = or i64 %[[i0]], 6
+; INTERLEAVE: %[[i7:.+]] = or i64 %[[i0]], 7
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i0]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i1]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i2]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i3]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i4]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i5]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i6]], i32 1
+; INTERLEAVE: getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %[[i7]], i32 1
+
+define void @scalarize_induction_variable_04(i32* %a, %pair.i32* %p, i32 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry]
+ %0 = shl nsw i64 %i, 2
+ %1 = getelementptr inbounds i32, i32* %a, i64 %0
+ %2 = load i32, i32* %1, align 1
+ %3 = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %2, i32* %3, align 1
+ %i.next = add nuw nsw i64 %i, 1
+ %4 = trunc i64 %i.next to i32
+ %cond = icmp eq i32 %4, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; PR30542. Ensure we generate all the scalar steps for the induction variable.
+; The scalar induction variable is used by a getelementptr instruction
+; (uniform) and by a udiv (non-uniform).
+;
+; int sum = 0;
+; for (int i = 0; i < n; ++i) {
+; int x = a[i];
+; if (c)
+; x /= i;
+; sum += x;
+; }
+;
+; CHECK-LABEL: @scalarize_induction_variable_05(
+; CHECK: vector.body:
+; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ]
+; CHECK: %[[I0:.+]] = add i32 %index, 0
+; CHECK: getelementptr inbounds i32, i32* %a, i32 %[[I0]]
+; CHECK: pred.udiv.if:
+; CHECK: udiv i32 {{.*}}, %[[I0]]
+; CHECK: pred.udiv.if{{[0-9]+}}:
+; CHECK: %[[I1:.+]] = add i32 %index, 1
+; CHECK: udiv i32 {{.*}}, %[[I1]]
+;
+; UNROLL-NO-IC-LABEL: @scalarize_induction_variable_05(
+; UNROLL-NO-IC: vector.body:
+; UNROLL-NO-IC: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ]
+; UNROLL-NO-IC: %[[I0:.+]] = add i32 %index, 0
+; UNROLL-NO-IC: %[[I2:.+]] = add i32 %index, 2
+; UNROLL-NO-IC: getelementptr inbounds i32, i32* %a, i32 %[[I0]]
+; UNROLL-NO-IC: getelementptr inbounds i32, i32* %a, i32 %[[I2]]
+; UNROLL-NO-IC: pred.udiv.if:
+; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I0]]
+; UNROLL-NO-IC: pred.udiv.if{{[0-9]+}}:
+; UNROLL-NO-IC: %[[I1:.+]] = add i32 %index, 1
+; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I1]]
+; UNROLL-NO-IC: pred.udiv.if{{[0-9]+}}:
+; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I2]]
+; UNROLL-NO-IC: pred.udiv.if{{[0-9]+}}:
+; UNROLL-NO-IC: %[[I3:.+]] = add i32 %index, 3
+; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I3]]
+;
+; IND-LABEL: @scalarize_induction_variable_05(
+; IND: vector.body:
+; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ]
+; IND: %[[E0:.+]] = sext i32 %index to i64
+; IND: getelementptr inbounds i32, i32* %a, i64 %[[E0]]
+; IND: pred.udiv.if:
+; IND: udiv i32 {{.*}}, %index
+; IND: pred.udiv.if{{[0-9]+}}:
+; IND: %[[I1:.+]] = or i32 %index, 1
+; IND: udiv i32 {{.*}}, %[[I1]]
+;
+; UNROLL-LABEL: @scalarize_induction_variable_05(
+; UNROLL: vector.body:
+; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue{{[0-9]+}} ]
+; UNROLL: %[[I2:.+]] = or i32 %index, 2
+; UNROLL: %[[E0:.+]] = sext i32 %index to i64
+; UNROLL: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]]
+; UNROLL: getelementptr inbounds i32, i32* %[[G0]], i64 2
+; UNROLL: pred.udiv.if:
+; UNROLL: udiv i32 {{.*}}, %index
+; UNROLL: pred.udiv.if{{[0-9]+}}:
+; UNROLL: %[[I1:.+]] = or i32 %index, 1
+; UNROLL: udiv i32 {{.*}}, %[[I1]]
+; UNROLL: pred.udiv.if{{[0-9]+}}:
+; UNROLL: udiv i32 {{.*}}, %[[I2]]
+; UNROLL: pred.udiv.if{{[0-9]+}}:
+; UNROLL: %[[I3:.+]] = or i32 %index, 3
+; UNROLL: udiv i32 {{.*}}, %[[I3]]
+
+define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %if.end ]
+ %sum = phi i32 [ 0, %entry ], [ %tmp4, %if.end ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i32 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %if.end
+
+if.then:
+ %tmp2 = udiv i32 %tmp1, %i
+ br label %if.end
+
+if.end:
+ %tmp3 = phi i32 [ %tmp2, %if.then ], [ %tmp1, %for.body ]
+ %tmp4 = add i32 %tmp3, %sum
+ %i.next = add nuw nsw i32 %i, 1
+ %cond = icmp slt i32 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp5 = phi i32 [ %tmp4, %if.end ]
+ ret i32 %tmp5
+}
+
+; Ensure we generate both a vector and a scalar induction variable. In this
+; test, the induction variable is used by an instruction that will be
+; vectorized (trunc) as well as an instruction that will remain in scalar form
+; (getelementptr).
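+;
+; A rough C equivalent (an illustrative sketch, not from the original source):
+;
+;   long long i = 0;
+;   do {
+;     p[i].f = (short)(a + (int)i);
+;   } while ((int)++i != n);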
+;
+; CHECK-LABEL: @iv_vector_and_scalar_users(
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %vec.ind = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK: %vec.ind1 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next2, %vector.body ]
+; CHECK: %[[i0:.+]] = add i64 %index, 0
+; CHECK: %[[i1:.+]] = add i64 %index, 1
+; CHECK: getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i0]], i32 1
+; CHECK: getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; CHECK: %index.next = add i64 %index, 2
+; CHECK: %vec.ind.next = add <2 x i64> %vec.ind, <i64 2, i64 2>
+; CHECK: %vec.ind.next2 = add <2 x i32> %vec.ind1, <i32 2, i32 2>
+;
+; IND-LABEL: @iv_vector_and_scalar_users(
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %vec.ind1 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next2, %vector.body ]
+; IND: %[[i1:.+]] = or i64 %index, 1
+; IND: getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %index, i32 1
+; IND: getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; IND: %index.next = add i64 %index, 2
+; IND: %vec.ind.next2 = add <2 x i32> %vec.ind1, <i32 2, i32 2>
+;
+; UNROLL-LABEL: @iv_vector_and_scalar_users(
+; UNROLL: vector.body:
+; UNROLL: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %vec.ind2 = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next5, %vector.body ]
+; UNROLL: %[[i1:.+]] = or i64 %index, 1
+; UNROLL: %[[i2:.+]] = or i64 %index, 2
+; UNROLL: %[[i3:.+]] = or i64 %index, 3
+; UNROLL: %step.add3 = add <2 x i32> %vec.ind2, <i32 2, i32 2>
+; UNROLL: getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %index, i32 1
+; UNROLL: getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i1]], i32 1
+; UNROLL: getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i2]], i32 1
+; UNROLL: getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %[[i3]], i32 1
+; UNROLL: %index.next = add i64 %index, 4
+; UNROLL: %vec.ind.next5 = add <2 x i32> %vec.ind2, <i32 4, i32 4>
+
+%pair.i16 = type { i16, i16 }
+define void @iv_vector_and_scalar_users(%pair.i16* %p, i32 %a, i32 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %0 = trunc i64 %i to i32
+ %1 = add i32 %a, %0
+ %2 = trunc i32 %1 to i16
+ %3 = getelementptr inbounds %pair.i16, %pair.i16* %p, i64 %i, i32 1
+ store i16 %2, i16* %3, align 2
+ %i.next = add nuw nsw i64 %i, 1
+ %4 = trunc i64 %i.next to i32
+ %cond = icmp eq i32 %4, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; Make sure that the loop exit count computation does not overflow for i8 and
+; i16. The exit count of these loops is i8/i16 max + 1. If we don't cast the
+; induction variable to a bigger type, the exit count computation will
+; overflow to 0.
+; PR17532
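+;
+; Sketch of the arithmetic: the i8 loop below runs 256 times, and
+; (i8)256 == 0, so the exit count must be computed as (i32)255 + 1 == 256
+; (similarly 65536 for the i16 loop), which is what the widened compares
+; below check.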
+
+; CHECK-LABEL: i8_loop
+; CHECK: icmp eq i32 {{.*}}, 256
+define i32 @i8_loop() nounwind readnone ssp uwtable {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+ %b.0 = phi i8 [ 0, %0 ], [ %3, %1 ]
+ %2 = and i32 %a.0, 4
+ %3 = add i8 %b.0, -1
+ %4 = icmp eq i8 %3, 0
+ br i1 %4, label %5, label %1
+
+; <label>:5 ; preds = %1
+ ret i32 %2
+}
+
+; CHECK-LABEL: i16_loop
+; CHECK: icmp eq i32 {{.*}}, 65536
+
+define i32 @i16_loop() nounwind readnone ssp uwtable {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+ %b.0 = phi i16 [ 0, %0 ], [ %3, %1 ]
+ %2 = and i32 %a.0, 4
+ %3 = add i16 %b.0, -1
+ %4 = icmp eq i16 %3, 0
+ br i1 %4, label %5, label %1
+
+; <label>:5 ; preds = %1
+ ret i32 %2
+}
+
+; This loop has a backedge-taken count of i32 max. We need to check for this
+; condition and branch directly to the scalar loop.
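+; Sketch of the arithmetic: the trip count is i32 max + 1 == 2^32, which an
+; i32 cannot represent, so the guard folds to 'br i1 true' and execution goes
+; straight to the scalar loop.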
+
+; CHECK-LABEL: max_i32_backedgetaken
+; CHECK: br i1 true, label %scalar.ph, label %vector.ph
+
+; CHECK: middle.block:
+; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0
+; CHECK: scalar.ph:
+; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ]
+; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ]
+
+define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
+
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %a.0 = phi i32 [ 1, %0 ], [ %2, %1 ]
+ %b.0 = phi i32 [ 0, %0 ], [ %3, %1 ]
+ %2 = and i32 %a.0, 4
+ %3 = add i32 %b.0, -1
+ %4 = icmp eq i32 %3, 0
+ br i1 %4, label %5, label %1
+
+; <label>:5 ; preds = %1
+ ret i32 %2
+}
+
+; When generating the overflow check, we must make sure that the induction
+; start value is defined before the branch to the scalar preheader.
+
+; CHECK-LABEL: testoverflowcheck
+; CHECK: entry
+; CHECK: %[[LOAD:.*]] = load i8
+; CHECK: br
+
+; CHECK: scalar.ph
+; CHECK: phi i8 [ %{{.*}}, %middle.block ], [ %[[LOAD]], %entry ]
+
+@e = global i8 1, align 1
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+define i32 @testoverflowcheck() {
+entry:
+ %.pr.i = load i8, i8* @e, align 1
+ %0 = load i32, i32* @d, align 4
+ %c.promoted.i = load i32, i32* @c, align 4
+ br label %cond.end.i
+
+cond.end.i:
+ %inc4.i = phi i8 [ %.pr.i, %entry ], [ %inc.i, %cond.end.i ]
+ %and3.i = phi i32 [ %c.promoted.i, %entry ], [ %and.i, %cond.end.i ]
+ %and.i = and i32 %0, %and3.i
+ %inc.i = add i8 %inc4.i, 1
+ %tobool.i = icmp eq i8 %inc.i, 0
+ br i1 %tobool.i, label %loopexit, label %cond.end.i
+
+loopexit:
+ ret i32 %and.i
+}
+
+; The SCEV expression of %sphi is (zext i8 {%t,+,1}<%loop> to i32)
+; In order to recognize %sphi as an induction PHI and vectorize this loop,
+; we need to convert the SCEV expression into an AddRecExpr.
+; The expression gets converted to {zext i8 %t to i32,+,1}.
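+;
+; Illustrative sketch (values not taken from the test): with %t == 254 and
+; more than two iterations, the i8 IV wraps (254, 255, 0, ...) and the zext'd
+; value falls back to 0 instead of reaching 256, so the converted AddRec is
+; only valid when the i8 IV does not wrap; vector.scevcheck guards this at
+; run time.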
+
+; CHECK-LABEL: wrappingindvars1
+; CHECK-LABEL: vector.scevcheck
+; CHECK-LABEL: vector.ph
+; CHECK: %[[START:.*]] = add <2 x i32> %{{.*}}, <i32 0, i32 1>
+; CHECK-LABEL: vector.body
+; CHECK: %[[PHI:.*]] = phi <2 x i32> [ %[[START]], %vector.ph ], [ %[[STEP:.*]], %vector.body ]
+; CHECK: %[[STEP]] = add <2 x i32> %[[PHI]], <i32 2, i32 2>
+define void @wrappingindvars1(i8 %t, i32 %len, i32 *%A) {
+ entry:
+ %st = zext i8 %t to i16
+ %ext = zext i8 %t to i32
+ %ecmp = icmp ult i16 %st, 42
+ br i1 %ecmp, label %loop, label %exit
+
+ loop:
+
+ %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ]
+ %idx.b = phi i32 [ 0, %entry ], [ %idx.b.inc, %loop ]
+ %sphi = phi i32 [ %ext, %entry ], [%idx.inc.ext, %loop]
+
+ %ptr = getelementptr inbounds i32, i32* %A, i8 %idx
+ store i32 %sphi, i32* %ptr
+
+ %idx.inc = add i8 %idx, 1
+ %idx.inc.ext = zext i8 %idx.inc to i32
+ %idx.b.inc = add nuw nsw i32 %idx.b, 1
+
+ %c = icmp ult i32 %idx.b, %len
+ br i1 %c, label %loop, label %exit
+
+ exit:
+ ret void
+}
+
+; The SCEV expression of %sphi is (4 * (zext i8 {%t,+,1}<%loop> to i32))
+; In order to recognize %sphi as an induction PHI and vectorize this loop,
+; we need to convert the SCEV expression into an AddRecExpr.
+; The expression gets converted to ({4 * (zext %t to i32),+,4}).
+; CHECK-LABEL: wrappingindvars2
+; CHECK-LABEL: vector.scevcheck
+; CHECK-LABEL: vector.ph
+; CHECK: %[[START:.*]] = add <2 x i32> %{{.*}}, <i32 0, i32 4>
+; CHECK-LABEL: vector.body
+; CHECK: %[[PHI:.*]] = phi <2 x i32> [ %[[START]], %vector.ph ], [ %[[STEP:.*]], %vector.body ]
+; CHECK: %[[STEP]] = add <2 x i32> %[[PHI]], <i32 8, i32 8>
+define void @wrappingindvars2(i8 %t, i32 %len, i32 *%A) {
+
+entry:
+ %st = zext i8 %t to i16
+ %ext = zext i8 %t to i32
+ %ext.mul = mul i32 %ext, 4
+
+ %ecmp = icmp ult i16 %st, 42
+ br i1 %ecmp, label %loop, label %exit
+
+ loop:
+
+ %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ]
+ %sphi = phi i32 [ %ext.mul, %entry ], [%mul, %loop]
+ %idx.b = phi i32 [ 0, %entry ], [ %idx.b.inc, %loop ]
+
+ %ptr = getelementptr inbounds i32, i32* %A, i8 %idx
+ store i32 %sphi, i32* %ptr
+
+ %idx.inc = add i8 %idx, 1
+ %idx.inc.ext = zext i8 %idx.inc to i32
+ %mul = mul i32 %idx.inc.ext, 4
+ %idx.b.inc = add nuw nsw i32 %idx.b, 1
+
+ %c = icmp ult i32 %idx.b, %len
+ br i1 %c, label %loop, label %exit
+
+ exit:
+ ret void
+}
+
+; Check that we generate vectorized IVs in the pre-header
+; instead of widening the scalar IV inside the loop, when
+; we know how to do that.
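+; Conceptually (a sketch, not literal checked IR): the pre-header materializes
+; the vector IV once as <0, 1>, and each iteration advances it with a single
+;   %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; rather than rebuilding <i, i+1> from the scalar IV with an
+; insertelement/shufflevector sequence inside the loop.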
+; IND-LABEL: veciv
+; IND: vector.body:
+; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; IND: %index.next = add i32 %index, 2
+; IND: %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND: %[[CMP:.*]] = icmp eq i32 %index.next
+; IND: br i1 %[[CMP]]
+; UNROLL-LABEL: veciv
+; UNROLL: vector.body:
+; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL: %index.next = add i32 %index, 4
+; UNROLL: %vec.ind.next = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next
+; UNROLL: br i1 %[[CMP]]
+define void @veciv(i32* nocapture %a, i32 %start, i32 %k) {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
+ store i32 %indvars.iv, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %k
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; IND-LABEL: trunciv
+; IND: vector.body:
+; IND: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %[[VECIND:.*]] = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %[[STEPADD:.*]], %vector.body ]
+; IND: %index.next = add i64 %index, 2
+; IND: %[[STEPADD]] = add <2 x i32> %[[VECIND]], <i32 2, i32 2>
+; IND: %[[CMP:.*]] = icmp eq i64 %index.next
+; IND: br i1 %[[CMP]]
+define void @trunciv(i32* nocapture %a, i32 %start, i64 %k) {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %trunc.iv = trunc i64 %indvars.iv to i32
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %trunc.iv
+ store i32 %trunc.iv, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %k
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: @nonprimary(
+; CHECK: vector.ph:
+; CHECK: %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; CHECK: %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK: %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; CHECK: vector.body:
+; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK: %offset.idx = add i32 %i, %index
+; CHECK: %[[A1:.*]] = add i32 %offset.idx, 0
+; CHECK: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
+; CHECK: %[[G3:.*]] = getelementptr inbounds i32, i32* %[[G1]], i32 0
+; CHECK: %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
+; CHECK: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; CHECK: %index.next = add i32 %index, 2
+; CHECK: %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; CHECK: %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; CHECK: br i1 %[[CMP]]
+;
+; IND-LABEL: @nonprimary(
+; IND: vector.ph:
+; IND: %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; IND: %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; IND: %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; IND: vector.body:
+; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; IND: %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; IND: %[[A1:.*]] = add i32 %index, %i
+; IND: %[[S1:.*]] = sext i32 %[[A1]] to i64
+; IND: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 %[[S1]]
+; IND: %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>*
+; IND: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; IND: %index.next = add i32 %index, 2
+; IND: %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; IND: %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; IND: br i1 %[[CMP]]
+;
+; UNROLL-LABEL: @nonprimary(
+; UNROLL: vector.ph:
+; UNROLL: %[[INSERT:.*]] = insertelement <2 x i32> undef, i32 %i, i32 0
+; UNROLL: %[[SPLAT:.*]] = shufflevector <2 x i32> %[[INSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; UNROLL: %[[START:.*]] = add <2 x i32> %[[SPLAT]], <i32 0, i32 1>
+; UNROLL: vector.body:
+; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; UNROLL: %vec.ind = phi <2 x i32> [ %[[START]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; UNROLL: %step.add = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; UNROLL: %[[A1:.*]] = add i32 %index, %i
+; UNROLL: %[[S1:.*]] = sext i32 %[[A1]] to i64
+; UNROLL: %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 %[[S1]]
+; UNROLL: %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>*
+; UNROLL: store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
+; UNROLL: %[[G2:.*]] = getelementptr inbounds i32, i32* %[[G1]], i64 2
+; UNROLL: %[[B2:.*]] = bitcast i32* %[[G2]] to <2 x i32>*
+; UNROLL: store <2 x i32> %step.add, <2 x i32>* %[[B2]]
+; UNROLL: %index.next = add i32 %index, 4
+; UNROLL: %vec.ind.next = add <2 x i32> %vec.ind, <i32 4, i32 4>
+; UNROLL: %[[CMP:.*]] = icmp eq i32 %index.next, %n.vec
+; UNROLL: br i1 %[[CMP]]
+define void @nonprimary(i32* nocapture %a, i32 %start, i32 %i, i32 %k) {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ %i, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i32 %indvars.iv
+ store i32 %indvars.iv, i32* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %k
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; CHECK-LABEL: @non_primary_iv_trunc(
+; CHECK: vector.body:
+; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK: [[TMP3:%.*]] = add i64 %index, 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: %index.next = add i64 %index, 2
+; CHECK: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+define void @non_primary_iv_trunc(i32* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = trunc i64 %j to i32
+ store i32 %tmp1, i32* %tmp0, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %j.next = add nuw nsw i64 %j, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR32419. Ensure we transform truncated non-primary induction variables. In
+; the test case below we replace %tmp1 with a new induction variable. Because
+; the truncated value is non-primary, we must compute an offset from the
+; primary induction variable.
+;
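+; A rough C analogue (hypothetical, for illustration): %tmp1 below is the
+; truncated non-primary IV, and the remainder is predicated on it being
+; non-zero.
+;
+;   int PR32419(int a, unsigned short b) {
+;     for (int i = -20; i != 0; ++i) {
+;       unsigned short t = (unsigned short)i;
+;       unsigned short r = (t == 0) ? 0 : (unsigned short)(b % t);
+;       a |= (int)(short)r;
+;     }
+;     return a;
+;   }
+;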
+; CHECK-LABEL: @PR32419(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE4:.*]] ]
+; CHECK: [[OFFSET_IDX:%.*]] = add i32 -20, [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
+; CHECK: [[TMP8:%.*]] = add i16 [[TMP1]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = urem i16 %b, [[TMP8]]
+; CHECK: [[TMP15:%.*]] = add i16 [[TMP1]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = urem i16 %b, [[TMP15]]
+; CHECK: [[PRED_UREM_CONTINUE4]]:
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @PR32419(i32 %a, i16 %b) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ -20, %entry ], [ %i.next, %for.inc ]
+ %tmp0 = phi i32 [ %a, %entry ], [ %tmp6, %for.inc ]
+ %tmp1 = trunc i32 %i to i16
+ %tmp2 = icmp eq i16 %tmp1, 0
+ br i1 %tmp2, label %for.inc, label %for.cond
+
+for.cond:
+ %tmp3 = urem i16 %b, %tmp1
+ br label %for.inc
+
+for.inc:
+ %tmp4 = phi i16 [ %tmp3, %for.cond ], [ 0, %for.body ]
+ %tmp5 = sext i16 %tmp4 to i32
+ %tmp6 = or i32 %tmp0, %tmp5
+ %i.next = add nsw i32 %i, 1
+ %cond = icmp eq i32 %i.next, 0
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
+
+; Ensure that the shufflevector for the first-order recurrence is inserted
+; correctly after all the phis. These new phis correspond to new IVs
+; that are generated by optimizing non-free truncations of IVs into IVs
+; themselves.
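+;
+; A rough C analogue (hypothetical, for illustration): %y carries the
+; previous iteration's truncated IV, which is the first-order recurrence.
+;
+;   long trunc_with_first_order_recurrence(void) {
+;     long sum = 0;
+;     int x = 1, y = 42;
+;     for (long iv = 1; iv != 114; ++iv) {
+;       int c6 = (int)iv;             // non-free trunc becomes its own IV
+;       int c9 = x * c6 + 42;
+;       sum += (long)(y + c6 + c9);   // uses the previous iteration's c6
+;       sum += (long)(c9 + 2 * (int)iv);
+;       y = c6;
+;       ++x;
+;     }
+;     return sum;
+;   }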
+define i64 @trunc_with_first_order_recurrence() {
+; CHECK-LABEL: trunc_with_first_order_recurrence
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT: %vec.phi = phi <2 x i64>
+; CHECK-NEXT: %vec.ind = phi <2 x i64> [ <i64 1, i64 2>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK-NEXT: %vec.ind2 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next3, %vector.body ]
+; CHECK-NEXT: %vector.recur = phi <2 x i32> [ <i32 undef, i32 42>, %vector.ph ], [ %vec.ind5, %vector.body ]
+; CHECK-NEXT: %vec.ind5 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next6, %vector.body ]
+; CHECK-NEXT: %vec.ind7 = phi <2 x i32> [ <i32 1, i32 2>, %vector.ph ], [ %vec.ind.next8, %vector.body ]
+; CHECK-NEXT: shufflevector <2 x i32> %vector.recur, <2 x i32> %vec.ind5, <2 x i32> <i32 1, i32 2>
+entry:
+ br label %loop
+
+exit: ; preds = %loop
+ %.lcssa = phi i64 [ %c23, %loop ]
+ ret i64 %.lcssa
+
+loop: ; preds = %loop, %entry
+ %c5 = phi i64 [ %c23, %loop ], [ 0, %entry ]
+ %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 1, %entry ]
+ %x = phi i32 [ %c24, %loop ], [ 1, %entry ]
+ %y = phi i32 [ %c6, %loop ], [ 42, %entry ]
+ %c6 = trunc i64 %indvars.iv to i32
+ %c8 = mul i32 %x, %c6
+ %c9 = add i32 %c8, 42
+ %c10 = add i32 %y, %c6
+ %c11 = add i32 %c10, %c9
+ %c12 = sext i32 %c11 to i64
+ %c13 = add i64 %c5, %c12
+ %indvars.iv.tr = trunc i64 %indvars.iv to i32
+ %c14 = shl i32 %indvars.iv.tr, 1
+ %c15 = add i32 %c9, %c14
+ %c16 = sext i32 %c15 to i64
+ %c23 = add i64 %c13, %c16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %c24 = add nuw nsw i32 %x, 1
+ %exitcond.i = icmp eq i64 %indvars.iv.next, 114
+ br i1 %exitcond.i, label %exit, label %loop
+
+}
diff --git a/llvm/test/Transforms/LoopVectorize/induction_plus.ll b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
new file mode 100644
index 00000000000..b09b278cb4b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/induction_plus.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@array = common global [1024 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @array_at_plus_one(
+;CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK: %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+;CHECK: %vec.ind1 = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next2, %vector.body ]
+;CHECK: %[[T1:.+]] = add i64 %index, 0
+;CHECK: %[[T2:.+]] = add nsw i64 %[[T1]], 12
+;CHECK: getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 %[[T2]]
+;CHECK: %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+;CHECK: %vec.ind.next2 = add <4 x i32> %vec.ind1, <i32 4, i32 4, i32 4, i32 4>
+;CHECK: ret i32
+define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = add nsw i64 %indvars.iv, 12
+ %3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 %2
+ %4 = trunc i64 %indvars.iv to i32
+ store i32 %4, i32* %3, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/infiniteloop.ll b/llvm/test/Transforms/LoopVectorize/infiniteloop.ll
new file mode 100644
index 00000000000..5c5e1a3be0a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/infiniteloop.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -indvars -loop-vectorize -force-vector-width=2 < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+
+@a = common global i64 0, align 8
+@x = common global i32 0, align 4
+
+; We used to assert on this loop because we could not find an induction
+; variable but assumed there must be one. Scalar evolution returned an exit
+; count for the loop below and from there on we assumed that there must be an
+; induction variable. This is not a valid assumption:
+; // getExitCount - Get the expression for the number of loop iterations for
+; // which this loop is *guaranteed not to exit* via ExitingBlock. Otherwise
+; // return SCEVCouldNotCompute.
+; For an infinite loop, SE can return any number.
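+;
+; A rough C analogue (hypothetical, for illustration): the exit test compares
+; the pre-increment counter, which starts at 0 and only grows, so the
+; condition is always true and the loop never terminates:
+;
+;   long i = 0;
+;   do {
+;     x = 0;                          // the volatile store
+;   } while (i++ > -2);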
+
+; CHECK-LABEL: @fn1(
+define void @fn1() {
+entry:
+ store i64 0, i64* @a, align 8
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %inc1 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ store volatile i32 0, i32* @x, align 4
+ %inc = add nsw i64 %inc1, 1
+ %cmp = icmp sgt i64 %inc1, -2
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %inc.lcssa = phi i64 [ %inc, %for.body ]
+ store i64 %inc.lcssa, i64* @a, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/int_sideeffect.ll b/llvm/test/Transforms/LoopVectorize/int_sideeffect.ll
new file mode 100644
index 00000000000..ec72bed7c7b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/int_sideeffect.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S < %s -loop-vectorize -force-vector-width=4 | FileCheck %s
+
+declare void @llvm.sideeffect()
+
+; Vectorization across a @llvm.sideeffect.
+
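+; A rough C analogue (hypothetical, for illustration): side_effect() stands
+; in for the @llvm.sideeffect intrinsic, an arbitrary side effect in each
+; iteration that must not prevent the store from being vectorized:
+;
+;   unsigned long i = 0;
+;   do {
+;     side_effect();
+;     p[i] = 1.0f;
+;   } while (++i < n);
+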
+; CHECK-LABEL: store_ones
+; CHECK: store <4 x float>
+define void @store_ones(float* %p, i64 %n) nounwind {
+bb7.lr.ph:
+ br label %bb7
+
+bb7:
+ %i.02 = phi i64 [ 0, %bb7.lr.ph ], [ %tmp13, %bb7 ]
+ call void @llvm.sideeffect()
+ %tmp10 = getelementptr inbounds float, float* %p, i64 %i.02
+ store float 1.0, float* %tmp10, align 4
+ %tmp13 = add i64 %i.02, 1
+ %tmp6 = icmp ult i64 %tmp13, %n
+ br i1 %tmp6, label %bb7, label %bb14
+
+bb14:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-1.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-1.ll
new file mode 100644
index 00000000000..1d487bfb89c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-1.ll
@@ -0,0 +1,78 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the interleaved-mem-access analysis identifies the access
+; to array 'in' as interleaved, despite the possibly wrapping unsigned
+; 'out_ix' index.
+;
+; In this test the interleave-groups are full (have no gaps), so no wrapping
+; checks are necessary. We can call getPtrStride with Assume=false and
+; ShouldCheckWrap=false to safely figure out that the stride is 2.
+
+; #include <stdlib.h>
+; class Complex {
+; private:
+; float real_;
+; float imaginary_;
+;
+;public:
+; Complex() : real_(0), imaginary_(0) { }
+; Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { }
+; Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { }
+;
+; inline float real() const { return real_; }
+; inline float imaginary() const { return imaginary_; }
+;};
+;
+;void test(Complex * __restrict__ out, Complex * __restrict__ in, size_t out_start, size_t size)
+;{
+; for (size_t out_offset = 0; out_offset < size; ++out_offset)
+; {
+; size_t out_ix = out_start + out_offset;
+; Complex t0 = in[out_ix];
+; out[out_ix] = t0;
+; }
+;}
+
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+
+%class.Complex = type { float, float }
+
+define void @_Z4testP7ComplexS0_mm(%class.Complex* noalias nocapture %out, %class.Complex* noalias nocapture readonly %in, i64 %out_start, i64 %size) local_unnamed_addr {
+entry:
+ %cmp9 = icmp eq i64 %size, 0
+ br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %out_offset.010 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %add = add i64 %out_offset.010, %out_start
+ %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %in, i64 %add
+ %0 = bitcast %class.Complex* %arrayidx to i32*
+ %1 = load i32, i32* %0, align 4
+ %imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %in, i64 %add, i32 1
+ %2 = bitcast float* %imaginary_.i.i to i32*
+ %3 = load i32, i32* %2, align 4
+ %arrayidx1 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add
+ %4 = bitcast %class.Complex* %arrayidx1 to i64*
+ %t0.sroa.4.0.insert.ext = zext i32 %3 to i64
+ %t0.sroa.4.0.insert.shift = shl nuw i64 %t0.sroa.4.0.insert.ext, 32
+ %t0.sroa.0.0.insert.ext = zext i32 %1 to i64
+ %t0.sroa.0.0.insert.insert = or i64 %t0.sroa.4.0.insert.shift, %t0.sroa.0.0.insert.ext
+ store i64 %t0.sroa.0.0.insert.insert, i64* %4, align 4
+ %inc = add nuw i64 %out_offset.010, 1
+ %exitcond = icmp eq i64 %inc, %size
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-2.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-2.ll
new file mode 100644
index 00000000000..667a26876a1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-2.ll
@@ -0,0 +1,58 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the interleaved-mem-access analysis currently does not create an
+; interleave group for the access to array 'in' due to the possibly wrapping
+; unsigned 'out_ix' index.
+;
+; In this test the interleave-group of the loads is not full (has gaps), so
+; the wrapping checks are necessary. Here this cannot be proven statically,
+; so runtime checks are needed; but with Assume=false getPtrStride cannot
+; add runtime checks, and as a result we can't create the interleave-group.
+;
+; FIXME: This is currently a missed optimization until we can use Assume=true
+; with proper threshold checks. Once we do that the candidate interleave-group
+; will not be invalidated by the wrapping checks.
+
+; #include <stdlib.h>
+; void test(float * __restrict__ out, float * __restrict__ in, size_t size)
+; {
+; for (size_t out_offset = 0; out_offset < size; ++out_offset)
+; {
+; float t0 = in[2*out_offset];
+; out[out_offset] = t0;
+; }
+; }
+
+; CHECK: vector.body:
+; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+define void @_Z4testPfS_m(float* noalias nocapture %out, float* noalias nocapture readonly %in, i64 %size) local_unnamed_addr {
+entry:
+ %cmp7 = icmp eq i64 %size, 0
+ br i1 %cmp7, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %out_offset.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %mul = shl i64 %out_offset.08, 1
+ %arrayidx = getelementptr inbounds float, float* %in, i64 %mul
+ %0 = bitcast float* %arrayidx to i32*
+ %1 = load i32, i32* %0, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %out, i64 %out_offset.08
+ %2 = bitcast float* %arrayidx1 to i32*
+ store i32 %1, i32* %2, align 4
+ %inc = add nuw i64 %out_offset.08, 1
+ %exitcond = icmp eq i64 %inc, %size
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll
new file mode 100644
index 00000000000..7f8babea86a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the interleaved-mem-access analysis currently does not create an
+; interleave group for access 'a' due to the possible pointer wrap-around.
+;
+; To begin with, in this test the candidate interleave group can be created
+; only when getPtrStride is called with Assume=true. Next, because
+; the interleave-group of the loads is not full (has gaps), we also need to check
+; for possible pointer wrapping. Here we currently use Assume=false, so we
+; cannot prove the transformation is safe, and we therefore invalidate the
+; candidate interleave group.
+;
+; FIXME: This is a missed optimization. Once we use Assume=true here, we will
+; not have to invalidate the group.
+
+; void func(unsigned * __restrict a, unsigned * __restrict b, unsigned char x, unsigned char y) {
+; int i = 0;
+; for (unsigned char index = x; i < y; index +=2, ++i)
+; b[i] = a[index] * 2;
+;
+; }
+
+; CHECK: vector.body:
+; CHECK-NOT: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+define void @_Z4funcPjS_hh(i32* noalias nocapture readonly %a, i32* noalias nocapture %b, i8 zeroext %x, i8 zeroext %y) local_unnamed_addr {
+entry:
+ %cmp9 = icmp eq i8 %y, 0
+ br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+ %wide.trip.count = zext i8 %y to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %index.011 = phi i8 [ %add, %for.body ], [ %x, %for.body.preheader ]
+ %idxprom = zext i8 %index.011 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul = shl i32 %0, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx2, align 4
+ %add = add i8 %index.011, 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll
new file mode 100644
index 00000000000..213c30602f0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-alias.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
+
+; When merging two stores with interleaved access vectorization, make sure we
+; propagate the alias information from all scalar stores to form the most
+; generic alias info.
+
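+; A rough C analogue (hypothetical, for illustration; the x/y field names
+; are assumed):
+;
+;   void foobar(const struct Vec4r *p, int i) {
+;     struct Vec2r cp[20];
+;     for (long k = 0; k < 4; ++k) {
+;       cp[k].x = p[k].x * 2.0;       // store tagged with Vec2r's first field
+;       cp[k].y = p[k].y * 3.0;       // store tagged with Vec2r's second field
+;     }
+;     g(cp);
+;   }
+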
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+%struct.Vec4r = type { double, double, double, double }
+%struct.Vec2r = type { double, double }
+
+define void @foobar(%struct.Vec4r* nocapture readonly %p, i32 %i) {
+entry:
+ %cp = alloca [20 x %struct.Vec2r], align 8
+ %0 = bitcast [20 x %struct.Vec2r]* %cp to i8*
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ %arraydecay = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* %cp, i64 0, i64 0
+ call void @g(%struct.Vec2r* nonnull %arraydecay) #4
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %x = getelementptr inbounds %struct.Vec4r, %struct.Vec4r* %p, i64 %indvars.iv, i32 0
+ %1 = load double, double* %x, align 8, !tbaa !3
+ %mul = fmul double %1, 2.000000e+00
+ %x4 = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* %cp, i64 0, i64 %indvars.iv, i32 0
+
+; The new store should alias any double rather than one of the fields of Vec2r.
+; CHECK: store <4 x double> {{.*}} !tbaa ![[STORE_TBAA:[0-9]+]]
+; CHECK-DAG: ![[DOUBLE_TBAA:[0-9]+]] = !{!"double", !{{[0-9+]}}, i64 0}
+; CHECK-DAG: ![[STORE_TBAA]] = !{![[DOUBLE_TBAA]], ![[DOUBLE_TBAA]], i64 0}
+ store double %mul, double* %x4, align 8, !tbaa !8
+ %y = getelementptr inbounds %struct.Vec4r, %struct.Vec4r* %p, i64 %indvars.iv, i32 1
+ %2 = load double, double* %y, align 8, !tbaa !10
+ %mul7 = fmul double %2, 3.000000e+00
+ %y10 = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* %cp, i64 0, i64 %indvars.iv, i32 1
+ store double %mul7, double* %y10, align 8, !tbaa !11
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 4
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare void @g(%struct.Vec2r*)
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 6.0.0 (trunk 319007) (llvm/trunk 319324)"}
+!3 = !{!4, !5, i64 0}
+!4 = !{!"Vec4r", !5, i64 0, !5, i64 8, !5, i64 16, !5, i64 24}
+!5 = !{!"double", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!9, !5, i64 0}
+!9 = !{!"Vec2r", !5, i64 0, !5, i64 8}
+!10 = !{!4, !5, i64 8}
+!11 = !{!9, !5, i64 8}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
new file mode 100644
index 00000000000..9ed66a22dbf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-masked-group.ll
@@ -0,0 +1,222 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; We test here that the loop-vectorizer forms an interleave-group from
+; predicated memory accesses only if they are both in the same (predicated)
+; block (first scenario below).
+; If the accesses are not in the same predicated block, an interleave-group
+; is not formed (scenarios 2,3 below).
+
+; Scenario 1: Check the case where it is legal to create masked interleave-
+; groups. Altogether two groups are created (one for loads and one for stores)
+; when masked-interleaved-accesses are enabled. When masked-interleaved-accesses
+; are disabled, we do not create any interleave-group.
+;
+; void masked_strided1(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+; if (ix > guard) {
+; char left = p[2*ix];
+; char right = p[2*ix + 1];
+; char max = max(left, right);
+; q[2*ix] = max;
+; q[2*ix+1] = 0 - max;
+; }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided1"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided1"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted: store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: into the interleave group with store i8 %{{.*}}, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Inserted: %{{.*}} = load i8, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: into the interleave group with %{{.*}} = load i8, i8* %{{.*}}, align 1
+
+; Scenario 2: Check the case where it is illegal to create a masked interleave-
+; group because the first access is predicated, and the second isn't.
+; We therefore create a separate interleave-group with gaps for each of the
+; stores (if masked-interleaved-accesses are enabled) and these are later
+; invalidated because interleave-groups of stores with gaps are not supported.
+; If masked-interleaved-accesses are not enabled, we create only one
+; interleave group of stores (for the non-predicated store), and it is later
+; invalidated due to gaps.
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard) {
+; for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;         q[2*ix] = 1;
+;     }
+;     q[2*ix+1] = 2;
+; }
+;}
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided2"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided2"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; Scenario 3: Check the case where it is illegal to create a masked interleave-
+; group because the two accesses are in separate predicated blocks.
+; We therefore create a separate interleave-group with gaps for each of the accesses,
+; (which are later invalidated because interleave-groups of stores with gaps are
+; not supported).
+; If masked-interleaved-accesses are not enabled, we don't create any
+; interleave group because all accesses are predicated.
+;
+; void masked_strided3(const unsigned char* restrict p,
+; unsigned char* restrict q,
+; unsigned char guard1,
+; unsigned char guard2) {
+; for(ix=0; ix < 1024; ++ix) {
+; if (ix > guard1) {
+; q[2*ix] = 1;
+; }
+; if (ix > guard2) {
+; q[2*ix+1] = 2;
+; }
+; }
+;}
+
+
+; STRIDED_UNMASKED: LV: Checking a loop in "masked_strided3"
+; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+
+; STRIDED_MASKED: LV: Checking a loop in "masked_strided3"
+; STRIDED_MASKED: LV: Analyzing interleaved accesses...
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+; STRIDED_MASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
+
+
+; ModuleID = 'test.c'
+source_filename = "test.c"
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.024, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.024, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = or i32 %mul, 1
+ %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+ %1 = load i8, i8* %arrayidx4, align 1
+ %cmp.i = icmp slt i8 %0, %1
+ %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+ %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %spec.select.i, i8* %arrayidx6, align 1
+ %sub = sub i8 0, %spec.select.i
+ %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx11, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.024, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+define dso_local void @masked_strided2(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.012 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %mul = shl nuw nsw i32 %ix.012, 1
+ %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 1, i8* %arrayidx, align 1
+ %cmp1 = icmp ugt i32 %ix.012, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %add = or i32 %mul, 1
+ %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 2, i8* %arrayidx3, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.012, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+
+define dso_local void @masked_strided3(i8* noalias nocapture readnone %p, i8* noalias nocapture %q, i8 zeroext %guard1, i8 zeroext %guard2) local_unnamed_addr #0 {
+entry:
+ %conv = zext i8 %guard1 to i32
+ %conv3 = zext i8 %guard2 to i32
+ br label %for.body
+
+for.body:
+ %ix.018 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %mul = shl nuw nsw i32 %ix.018, 1
+ %cmp1 = icmp ugt i32 %ix.018, %conv
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+ %arrayidx = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 1, i8* %arrayidx, align 1
+ br label %if.end
+
+if.end:
+ %cmp4 = icmp ugt i32 %ix.018, %conv3
+ br i1 %cmp4, label %if.then6, label %for.inc
+
+if.then6:
+ %add = or i32 %mul, 1
+ %arrayidx7 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 2, i8* %arrayidx7, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.018, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+attributes #0 = { "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
new file mode 100644
index 00000000000..c647f586b18
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll
@@ -0,0 +1,165 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+%pair = type { i64, i64 }
+
+; Ensure that we vectorize the interleaved load group even though the loop
+; contains a conditional store. The store group contains gaps and is not
+; vectorized.
+;
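+; A rough C analogue (hypothetical, for illustration; the x/y field names
+; are assumed):
+;
+;   long i = 0;
+;   do {
+;     if (p[i].y == x)
+;       p[i].y = x;                   // conditionally store back the loaded value
+;   } while (++i < n);
+;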
+; CHECK-LABEL: @interleaved_with_cond_store_0(
+;
+; CHECK: vector.ph
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK: %strided.vec = shufflevector <4 x i64> %wide.vec, <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK: store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK: store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_0(%pair *%p, i64 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+ %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ %0 = load i64, i64* %p.1, align 8
+ %1 = icmp eq i64 %0, %x
+ br i1 %1, label %if.then, label %if.merge
+
+if.then:
+ store i64 %0, i64* %p.1, align 8
+ br label %if.merge
+
+if.merge:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Ensure that we don't form a single interleaved group for the two loads. The
+; conditional store prevents the second load from being hoisted. The two load
+; groups are separately vectorized. The store group contains gaps and is not
+; vectorized.
+;
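+; A rough C analogue (hypothetical, for illustration; the x/y field names
+; are assumed):
+;
+;   long i = 0;
+;   do {
+;     if (p[i].y == x)
+;       p[i].x = p[i].y;              // predicated store to the first field
+;     p[i].y = p[i].x;                // unconditional load/store pair
+;   } while (++i < n);
+;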
+; CHECK-LABEL: @interleaved_with_cond_store_1(
+;
+; CHECK: vector.ph
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK: %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+;
+; CHECK: pred.store.if
+; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK: store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK: store i64 %[[X2]], {{.*}}
+;
+; CHECK: pred.store.continue
+; CHECK: %[[L2:.+]] = load <4 x i64>, <4 x i64>* {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <4 x i64> %[[L2]], i32 0
+; CHECK: store i64 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <4 x i64> %[[L2]], i32 2
+; CHECK: store i64 %[[X4]], {{.*}}
+
+define void @interleaved_with_cond_store_1(%pair *%p, i64 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+ %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+ %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ %0 = load i64, i64* %p.1, align 8
+ %1 = icmp eq i64 %0, %x
+ br i1 %1, label %if.then, label %if.merge
+
+if.then:
+ store i64 %0, i64* %p.0, align 8
+ br label %if.merge
+
+if.merge:
+ %2 = load i64, i64* %p.0, align 8
+ store i64 %2, i64 *%p.1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Ensure that we don't create a single interleaved group for the two stores.
+; The second store is conditional and we can't sink the first store inside the
+; predicated block. The load group is vectorized, and the store groups contain
+; gaps and are not vectorized.
+;
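+; A rough C analogue (hypothetical, for illustration; the x/y field names
+; are assumed):
+;
+;   long i = 0;
+;   do {
+;     long t = p[i].y;
+;     p[i].x = x;                     // unconditional store
+;     if (t == x)
+;       p[i].y = t;                   // predicated store
+;   } while (++i < n);
+;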
+; CHECK-LABEL: @interleaved_with_cond_store_2(
+;
+; CHECK: vector.ph
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 1
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:.+]] = select i1 %[[IsZero]], i64 2, i64 %n.mod.vf
+; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
+;
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}}
+; CHECK: %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> <i32 0, i32 2>
+; CHECK: store i64 %x, {{.*}}
+; CHECK: store i64 %x, {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0
+; CHECK: store i64 %[[X1]], {{.*}}
+;
+; CHECK: pred.store.if
+; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2
+; CHECK: store i64 %[[X2]], {{.*}}
+
+define void @interleaved_with_cond_store_2(%pair *%p, i64 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
+ %p.0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
+ %p.1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
+ %0 = load i64, i64* %p.1, align 8
+ store i64 %x, i64* %p.0, align 8
+ %1 = icmp eq i64 %0, %x
+ br i1 %1, label %if.then, label %if.merge
+
+if.then:
+ store i64 %0, i64* %p.1, align 8
+ br label %if.merge
+
+if.merge:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
new file mode 100644
index 00000000000..2d8e9afbb91
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -0,0 +1,921 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; Check vectorization on an interleaved load group of factor 2 and an interleaved
+; store group of factor 2.
+
+; int AB[1024];
+; int CD[1024];
+; void test_array_load2_store2(int C, int D) {
+; for (int i = 0; i < 1024; i+=2) {
+; int A = AB[i];
+; int B = AB[i+1];
+; CD[i] = A + C;
+; CD[i+1] = B * D;
+; }
+; }
+
+; CHECK-LABEL: @test_array_load2_store2(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: add nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4
+
+@AB = common global [1024 x i32] zeroinitializer, align 4
+@CD = common global [1024 x i32] zeroinitializer, align 4
+
+define void @test_array_load2_store2(i32 %C, i32 %D) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx0, align 4
+ %tmp1 = or i64 %indvars.iv, 1
+ %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
+ %tmp2 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %tmp, %C
+ %mul = mul nsw i32 %tmp2, %D
+ %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
+ store i32 %mul, i32* %arrayidx3, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp slt i64 %indvars.iv.next, 1024
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Check vectorization on an interleaved load group of factor 3 and an
+; interleaved store group of factor 3.
+
+; int A[3072];
+; struct ST3 S[1024];
+; void test_struct_array_load3_store3() {
+;   int *ptr = A;
+;   for (int i = 0; i < 1024; i++) {
+;     int X1 = *ptr++;
+;     int X2 = *ptr++;
+;     int X3 = *ptr++;
+;     S[i].x = X1 + 1;
+;     S[i].y = X2 + 2;
+;     S[i].z = X3 + 3;
+;   }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
+; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+%struct.ST3 = type { i32, i32, i32 }
+@A = common global [3072 x i32] zeroinitializer, align 4
+@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+ %tmp = load i32, i32* %ptr.016, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+ %tmp1 = load i32, i32* %incdec.ptr, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+ %tmp2 = load i32, i32* %incdec.ptr1, align 4
+ %add = add nsw i32 %tmp, 1
+ %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+ store i32 %add, i32* %x, align 4
+ %add3 = add nsw i32 %tmp1, 2
+ %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+ store i32 %add3, i32* %y, align 4
+ %add6 = add nsw i32 %tmp2, 3
+ %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+ store i32 %add6, i32* %z, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Check vectorization on an interleaved load group of factor 4.
+
+; struct ST4{
+; int x;
+; int y;
+; int z;
+; int w;
+; };
+; int test_struct_load4(struct ST4 *S) {
+; int r = 0;
+; for (int i = 0; i < 1024; i++) {
+; r += S[i].x;
+; r -= S[i].y;
+; r += S[i].z;
+; r -= S[i].w;
+; }
+; return r;
+; }
+
+; CHECK-LABEL: @test_struct_load4(
+; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub <4 x i32>
+
+%struct.ST4 = type { i32, i32, i32, i32 }
+
+define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
+ %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
+ %tmp = load i32, i32* %x, align 4
+ %add = add nsw i32 %tmp, %r.022
+ %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
+ %tmp1 = load i32, i32* %y, align 4
+ %sub = sub i32 %add, %tmp1
+ %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
+ %tmp2 = load i32, i32* %z, align 4
+ %add5 = add nsw i32 %sub, %tmp2
+ %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
+ %tmp3 = load i32, i32* %w, align 4
+ %sub8 = sub i32 %add5, %tmp3
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 %sub8
+}
+
+; Check vectorization on an interleaved store group of factor 4.
+
+; void test_struct_store4(int *A, struct ST4 *B) {
+; int *ptr = A;
+; for (int i = 0; i < 1024; i++) {
+; int X = *ptr++;
+; B[i].x = X + 1;
+; B[i].y = X * 2;
+; B[i].z = X + 3;
+; B[i].w = X + 4;
+; }
+; }
+
+; CHECK-LABEL: @test_struct_store4(
+; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4
+
+define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
+ %tmp = load i32, i32* %ptr.024, align 4
+ %add = add nsw i32 %tmp, 1
+ %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
+ store i32 %add, i32* %x, align 4
+ %mul = shl nsw i32 %tmp, 1
+ %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
+ store i32 %mul, i32* %y, align 4
+ %add3 = add nsw i32 %tmp, 3
+ %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
+ store i32 %add3, i32* %z, align 4
+ %add6 = add nsw i32 %tmp, 4
+ %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
+ store i32 %add6, i32* %w, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check vectorization on a reverse interleaved load group of factor 2 and
+; a reverse interleaved store group of factor 2.
+
+; struct ST2 {
+; int x;
+; int y;
+; };
+;
+; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
+; for (int i = 1023; i >= 0; i--) {
+; int a = A[i].x + i; // interleaved load of index 0
+; int b = A[i].y - i; // interleaved load of index 1
+; B[i].x = a; // interleaved store of index 0
+; B[i].y = b; // interleaved store of index 1
+; }
+; }
+
+; CHECK-LABEL: @test_reversed_load2_store2(
+; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
+; CHECK: %[[G1:.+]] = getelementptr inbounds i32, i32* %[[G0]], i64 -6
+; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: add nsw <4 x i32>
+; CHECK: sub nsw <4 x i32>
+; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
+; CHECK: %[[G3:.+]] = getelementptr inbounds i32, i32* %[[G2]], i64 -7
+; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %[[B1]], align 4
+
+%struct.ST2 = type { i32, i32 }
+
+define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
+ %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
+ %tmp = load i32, i32* %x, align 4
+ %tmp1 = trunc i64 %indvars.iv to i32
+ %add = add nsw i32 %tmp, %tmp1
+ %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
+ %tmp2 = load i32, i32* %y, align 4
+ %sub = sub nsw i32 %tmp2, %tmp1
+ %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
+ store i32 %add, i32* %x5, align 4
+ %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
+ store i32 %sub, i32* %y8, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, -1
+ %cmp = icmp sgt i64 %indvars.iv, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on an interleaved load group of factor 2 with 1 gap
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop.
+
+; void even_load_static_tc(int *A, int *B) {
+; for (unsigned i = 0; i < 1024; i+=2)
+; B[i/2] = A[i] * 2;
+; }
+
+; CHECK-LABEL: @even_load_static_tc(
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: icmp eq i64 %index.next, 508
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
+
+define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %mul = shl nsw i32 %tmp, 1
+ %tmp1 = lshr exact i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+ store i32 %mul, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp ult i64 %indvars.iv.next, 1024
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on an interleaved load group of factor 2 with 1 gap
+; (missing the load of odd elements). Because the vectorized loop would
+; speculatively access memory out-of-bounds, we must execute at least one
+; iteration of the scalar loop.
+
+; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
+; for (unsigned i = 0; i < N; i+=2)
+; B[i/2] = A[i] * 2;
+; }
+
+; CHECK-LABEL: @even_load_dynamic_tc(
+; CHECK: vector.ph:
+; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: icmp eq i64 %index.next, %n.vec
+; CHECK: middle.block:
+; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
+
+define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %mul = shl nsw i32 %tmp, 1
+ %tmp1 = lshr exact i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+ store i32 %mul, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp ult i64 %indvars.iv.next, %N
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on a reverse interleaved load group of factor 2 with 1
+; gap and a reverse interleaved store group of factor 2. The interleaved load
+; group should be removed since it has a gap and is reverse.
+
+; struct pair {
+;   long x;
+;   long y;
+; };
+;
+; void load_gap_reverse(struct pair *P1, struct pair *P2, long X) {
+;   for (long i = 1023; i >= 0; i--) {
+;     long a = X + i;
+;     long b = P2[i].y - i;
+;     P1[i].x = a;
+;     P2[i].y = b;
+;   }
+; }
+
+; CHECK-LABEL: @load_gap_reverse(
+; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
+; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+
+%pair = type { i64, i64 }
+define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
+ %0 = add nsw i64 %X, %i
+ %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
+ %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
+ %3 = load i64, i64* %2, align 8
+ %4 = sub nsw i64 %3, %i
+ store i64 %0, i64* %1, align 8
+ store i64 %4, i64* %2, align 8
+ %i.next = add nsw i64 %i, -1
+ %cond = icmp sgt i64 %i, 0
+ br i1 %cond, label %for.body, label %for.exit
+
+for.exit:
+ ret void
+}
+
+; Check vectorization on interleaved access groups identified from mixed
+; loads/stores.
+; void mixed_load2_store2(int *A, int *B) {
+; for (unsigned i = 0; i < 1024; i+=2) {
+; B[i] = A[i] * A[i+1];
+; B[i+1] = A[i] + A[i+1];
+; }
+; }
+
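+; Lane-level sketch of the shuffles checked below: if %wide.vec holds
+; A[0..7], the two de-interleaving shuffles produce
+;   evens = <A[0], A[2], A[4], A[6]>   (mask <0, 2, 4, 6>)
+;   odds  = <A[1], A[3], A[5], A[7]>   (mask <1, 3, 5, 7>)
+; and the store side re-interleaves the four products and four sums with
+; mask <0, 4, 1, 5, 2, 6, 3, 7>, i.e. <mul0, add0, mul1, add1, ...>, so a
+; single <8 x i32> store covers B[i] and B[i+1] for four iterations.
+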
+; CHECK-LABEL: @mixed_load2_store2(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %interleaved.vec
+
+define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp = load i32, i32* %arrayidx, align 4
+ %tmp1 = or i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
+ %tmp2 = load i32, i32* %arrayidx2, align 4
+ %mul = mul nsw i32 %tmp2, %tmp
+ %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ store i32 %mul, i32* %arrayidx4, align 4
+ %tmp3 = load i32, i32* %arrayidx, align 4
+ %tmp4 = load i32, i32* %arrayidx2, align 4
+ %add10 = add nsw i32 %tmp4, %tmp3
+ %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
+ store i32 %add10, i32* %arrayidx13, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
+ %cmp = icmp ult i64 %indvars.iv.next, 1024
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+; Check vectorization on interleaved access groups identified from mixed
+; loads/stores.
+; void mixed_load3_store3(int *A) {
+; for (unsigned i = 0; i < 1024; i++) {
+; *A++ += i;
+; *A++ += i;
+; *A++ += i;
+; }
+; }
+
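+; Lane-level sketch for the factor-3 group below: a <12 x i32> wide load
+; holding A[0..11] is split by the three masks into
+;   <A[0], A[3], A[6], A[9]>, <A[1], A[4], A[7], A[10]>, <A[2], A[5], A[8], A[11]>
+; and the three updated vectors are re-interleaved with mask
+; <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> before one <12 x i32> store.
+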
+; CHECK-LABEL: @mixed_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4
+
+define void @mixed_load3_store3(i32* nocapture %A) {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
+ %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
+ %tmp = load i32, i32* %A.addr.012, align 4
+ %add = add i32 %tmp, %i.013
+ store i32 %add, i32* %A.addr.012, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
+ %tmp1 = load i32, i32* %incdec.ptr, align 4
+ %add2 = add i32 %tmp1, %i.013
+ store i32 %add2, i32* %incdec.ptr, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
+ %tmp2 = load i32, i32* %incdec.ptr1, align 4
+ %add4 = add i32 %tmp2, %i.013
+ store i32 %add4, i32* %incdec.ptr1, align 4
+ %inc = add nuw nsw i32 %i.013, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check vectorization on interleaved access groups whose members have
+; different types.
+
+; struct IntFloat {
+; int a;
+; float b;
+; };
+;
+; int SA;
+; float SB;
+;
+; void int_float_struct(struct IntFloat *A) {
+; int SumA;
+; float SumB;
+; for (unsigned i = 0; i < 1024; i++) {
+; SumA += A[i].a;
+; SumB += A[i].b;
+; }
+; SA = SumA;
+; SB = SumB;
+; }
+
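+; The trick checked below: both members are covered by a single <8 x i32>
+; wide load; the even lanes feed the integer reduction directly, while the
+; odd lanes are bitcast to <4 x float> for the FP reduction. This works
+; because i32 and float have the same size, so one interleaved group can
+; span members of different types.
+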
+; CHECK-LABEL: @int_float_struct(
+; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
+; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
+; CHECK: add nsw <4 x i32>
+; CHECK: fadd fast <4 x float>
+
+%struct.IntFloat = type { i32, float }
+
+@SA = common global i32 0, align 4
+@SB = common global float 0.000000e+00, align 4
+
+define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ store i32 %add, i32* @SA, align 4
+ store float %add3, float* @SB, align 4
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
+ %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
+ %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
+ %tmp = load i32, i32* %a, align 4
+ %add = add nsw i32 %tmp, %SumA.013
+ %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
+ %tmp1 = load float, float* %b, align 4
+ %add3 = fadd fast float %SumB.014, %tmp1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check vectorization of interleaved access groups in the presence of
+; dependences (PR27626). The following tests check that we don't reorder
+; dependent loads and stores when generating code for interleaved access
+; groups. Stores should be scalarized because the required code motion would
+; break dependences, and the remaining interleaved load groups should have
+; gaps.
+
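+; For example, in PR27626_0 below the store to p[i].x and the load of
+; p[i].x form a zero-distance dependence. Combining the strided stores into
+; one wide store would sink them past the wide load of the same locations
+; and read stale values; instead the stores remain scalar, in source order,
+; while the loads still form a (gapped) interleaved group.
+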
+; PR27626_0: Ensure a strided store is not moved after a dependent (zero
+; distance) strided load.
+
+; void PR27626_0(struct pair *p, int z, int n) {
+; for (int i = 0; i < n; i++) {
+; p[i].x = z;
+; p[i].y = p[i].x;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_0(
+; CHECK: vector.ph:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+%pair.i32 = type { i32, i32 }
+define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %z, i32* %p_i.x, align 4
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32 *%p_i.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_1: Ensure a strided load is not moved before a dependent (zero
+; distance) strided store.
+
+; void PR27626_1(struct pair *p, int n) {
+; int s = 0;
+; for (int i = 0; i < n; i++) {
+; p[i].y = p[i].x;
+;     s += p[i].y;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_1(
+; CHECK: vector.ph:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
+; CHECK: store i32 %[[X1:.+]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
+; CHECK: store i32 %[[X2:.+]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
+; CHECK: store i32 %[[X3:.+]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
+; CHECK: store i32 %[[X4:.+]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32* %p_i.y, align 4
+ %1 = load i32, i32* %p_i.y, align 4
+ %2 = add nsw i32 %1, %s
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i32 [ %2, %for.body ]
+ ret i32 %3
+}
+
+; PR27626_2: Ensure a strided store is not moved after a dependent (negative
+; distance) strided load.
+
+; void PR27626_2(struct pair *p, int z, int n) {
+; for (int i = 0; i < n; i++) {
+; p[i].x = z;
+; p[i].y = p[i - 1].x;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_2(
+; CHECK: vector.ph:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
+; CHECK: store i32 %[[X1]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
+; CHECK: store i32 %[[X2]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
+; CHECK: store i32 %[[X3]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
+; CHECK: store i32 %[[X4]], {{.*}}
+
+define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %i_minus_1 = add nuw nsw i64 %i, -1
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ store i32 %z, i32* %p_i.x, align 4
+ %0 = load i32, i32* %p_i_minus_1.x, align 4
+ store i32 %0, i32 *%p_i.y, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_3: Ensure a strided load is not moved before a dependent (negative
+; distance) strided store.
+
+; void PR27626_3(struct pair *p, int z, int n) {
+;   int s = 0;
+;   for (int i = 0; i < n; i++) {
+; p[i + 1].y = p[i].x;
+; s += p[i].y;
+; }
+; }
+
+; CHECK-LABEL: @PR27626_3(
+; CHECK: vector.ph:
+; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
+; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
+; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
+; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
+; CHECK: vector.body:
+; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
+; CHECK: store i32 %[[X1:.+]], {{.*}}
+; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
+; CHECK: store i32 %[[X2:.+]], {{.*}}
+; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
+; CHECK: store i32 %[[X3:.+]], {{.*}}
+; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
+; CHECK: store i32 %[[X4:.+]], {{.*}}
+; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
+; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
+
+define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
+ %i_plus_1 = add nuw nsw i64 %i, 1
+ %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
+ %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
+ %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
+ %0 = load i32, i32* %p_i.x, align 4
+ store i32 %0, i32* %p_i_plus_1.y, align 4
+ %1 = load i32, i32* %p_i.y, align 4
+ %2 = add nsw i32 %1, %s
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %3 = phi i32 [ %2, %for.body ]
+ ret i32 %3
+}
+
+; PR27626_4: Ensure we form an interleaved group for strided stores in the
+; presence of a write-after-write dependence. We create a group for
+; (2) and (3) while excluding (1).
+
+; void PR27626_4(int *a, int x, int y, int z, int n) {
+; for (int i = 0; i < n; i += 2) {
+; a[i] = x; // (1)
+; a[i] = y; // (2)
+; a[i + 1] = z; // (3)
+; }
+; }
+
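+; The first two stores write the same address, so only the later value y
+; survives each iteration. The group formed for (2) and (3) is emitted as a
+; wide store after the scalarized stores of x, preserving the write order:
+; the y and z splats are interleaved with mask <0, 4, 1, 5, 2, 6, 3, 7>
+; into <y, z, y, z, ...> and written by one <8 x i32> store, as the CHECKs
+; below verify.
+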
+; CHECK-LABEL: @PR27626_4(
+; CHECK: vector.ph:
+; CHECK: %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
+; CHECK: %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
+; CHECK: %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: vector.body:
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK: store <8 x i32> %[[VEC]], {{.*}}
+
+define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %i_plus_1 = add i64 %i, 1
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
+ store i32 %x, i32* %a_i, align 4
+ store i32 %y, i32* %a_i, align 4
+ store i32 %z, i32* %a_i_plus_1, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR27626_5: Ensure we do not form an interleaved group for strided stores in
+; the presence of a write-after-write dependence.
+
+; void PR27626_5(int *a, int x, int y, int z, int n) {
+; for (int i = 3; i < n; i += 2) {
+; a[i - 1] = x;
+; a[i - 3] = y;
+; a[i] = z;
+; }
+; }
+
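+; Here a[i - 3] in one iteration is the a[i - 1] of the previous iteration
+; (the even-indexed addresses differ by 2, exactly the step), a
+; write-after-write dependence that grouping the strided stores could
+; reorder. No group is formed, and all three stores are scalarized, giving
+; the four copies each of the x, y and z stores checked below.
+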
+; CHECK-LABEL: @PR27626_5(
+; CHECK: vector.body:
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %x, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %y, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+; CHECK: store i32 %z, {{.*}}
+
+define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
+ %i_minus_1 = sub i64 %i, 1
+ %i_minus_3 = sub i64 %i_minus_1, 2
+ %a_i = getelementptr inbounds i32, i32* %a, i64 %i
+ %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
+ %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
+ store i32 %x, i32* %a_i_minus_1, align 4
+ store i32 %y, i32* %a_i_minus_3, align 4
+ store i32 %z, i32* %a_i, align 4
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; PR34743: Ensure that a cast which needs to sink after a load that belongs
+; to an interleaved group indeed gets sunk.
+
+; void PR34743(short *a, int *b, int n) {
+; for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
+; b[i] = a[iv] * a[iv+1] * a[iv+2];
+; }
+; }
+
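+; The a[iv] operand is the a[iv+2] loaded by the previous iteration, i.e. a
+; first-order recurrence over the odd lanes of the wide load. As the CHECKs
+; below show, %vector.recur carries the previous vector's odd lanes, and
+; the splice mask <3, 4, 5, 6> combines its last element with the first
+; three current odd lanes, yielding <a[iv], a[iv+2], a[iv+4], a[iv+6]> for
+; the four unrolled iterations; the sext of this recurrence must therefore
+; be sunk below the wide load.
+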
+; CHECK-LABEL: @PR34743(
+; CHECK: vector.body:
+; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[VSHUF1:.+]], %vector.body ]
+; CHECK: %wide.vec = load <8 x i16>
+; CHECK: %[[VSHUF0:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[VSHUF1:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %[[VSHUF1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK: sext <4 x i16> %[[VSHUF0]] to <4 x i32>
+; CHECK: sext <4 x i16> %[[VSHUF]] to <4 x i32>
+; CHECK: sext <4 x i16> %[[VSHUF1]] to <4 x i32>
+; CHECK: mul nsw <4 x i32>
+; CHECK: mul nsw <4 x i32>
+
+define void @PR34743(i16* %a, i32* %b, i64 %n) {
+entry:
+ %.pre = load i16, i16* %a
+ br label %loop
+
+loop:
+ %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
+ %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
+ %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
+ %conv = sext i16 %0 to i32
+ %i1 = add nuw nsw i64 %i, 1
+ %iv1 = add nuw nsw i64 %iv, 1
+ %iv2 = add nuw nsw i64 %iv, 2
+ %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
+ %load1 = load i16, i16* %gep1, align 4
+ %conv1 = sext i16 %load1 to i32
+ %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
+ %load2 = load i16, i16* %gep2, align 4
+ %conv2 = sext i16 %load2 to i32
+ %mul01 = mul nsw i32 %conv, %conv1
+ %mul012 = mul nsw i32 %mul01, %conv2
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
+ store i32 %mul012, i32* %arrayidx5
+ %exitcond = icmp eq i64 %iv, %n
+ br i1 %exitcond, label %end, label %loop
+
+end:
+ ret void
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll b/llvm/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll
new file mode 100644
index 00000000000..434c623e056
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-acess-with-remarks.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 --pass-remarks=loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+
+; This test only ensures that asking for remarks does not make the compiler
+; crash (or time out). We merely check that some output is produced and, to
+; be sure, that the loop was not vectorized.
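+; The monotonic atomic accesses are modeled as both reading and writing
+; memory (hence the function name), which exercises analysis paths that
+; must refuse to place them in interleaved groups.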
+; CHECK-LABEL: @atomicLoadsBothWriteAndReadMem
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+%"struct.std::__atomic_base" = type { i32 }
+%"struct.std::atomic" = type { %"struct.std::__atomic_base" }
+%union.anon = type { i64 }
+%MyStruct = type { i32, %"struct.std::atomic", %union.anon }
+
+define void @atomicLoadsBothWriteAndReadMem(%MyStruct *%a, %MyStruct *%b, %MyStruct *%lim) {
+entry:
+ br label %loop
+
+loop:
+ %0 = phi %MyStruct* [ %a, %entry ], [ %ainc, %loop ]
+ %1 = phi %MyStruct* [ %b, %entry ], [ %binc, %loop ]
+ %2 = getelementptr %MyStruct, %MyStruct* %1, i64 0, i32 0
+ %3 = load i32, i32* %2, align 8
+ %4 = getelementptr inbounds %MyStruct, %MyStruct* %0, i64 0, i32 0
+ store i32 %3, i32* %4, align 8
+ %5 = getelementptr inbounds %MyStruct, %MyStruct* %1, i64 0, i32 1, i32 0, i32 0
+ %6 = load atomic i32, i32* %5 monotonic, align 4
+ %7 = getelementptr inbounds %MyStruct, %MyStruct* %0, i64 0, i32 1, i32 0, i32 0
+ store atomic i32 %6, i32* %7 monotonic, align 4
+ %8 = getelementptr inbounds %MyStruct, %MyStruct* %1, i64 0, i32 2, i32 0
+ %9 = getelementptr inbounds %MyStruct, %MyStruct* %0, i64 0, i32 2, i32 0
+ %10 = load i64, i64* %8, align 8
+ store i64 %10, i64* %9, align 8
+ %binc = getelementptr inbounds %MyStruct, %MyStruct* %1, i64 1
+ %ainc = getelementptr inbounds %MyStruct, %MyStruct* %0, i64 1
+ %cond = icmp eq %MyStruct* %binc, %lim
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/intrinsic.ll b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
new file mode 100644
index 00000000000..50cdb73ae8e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/intrinsic.ll
@@ -0,0 +1,1377 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @sqrt_f32(
+;CHECK: llvm.sqrt.v4f32
+;CHECK: ret void
+define void @sqrt_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.sqrt.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.sqrt.f32(float) nounwind readnone
+
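+; Every test in this file follows the same pattern: a loop applying a
+; scalar intrinsic elementwise should be widened so that the call becomes
+; the corresponding vector intrinsic. For @sqrt_f32 above, the vector body
+; is expected to contain, roughly (a sketch rather than exact output; %ptr
+; and %ptr2 stand for the bitcast pointers):
+;   %wide.load = load <4 x float>, <4 x float>* %ptr, align 4
+;   %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %wide.load)
+;   store <4 x float> %0, <4 x float>* %ptr2, align 4
+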
+;CHECK-LABEL: @sqrt_f64(
+;CHECK: llvm.sqrt.v4f64
+;CHECK: ret void
+define void @sqrt_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.sqrt.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.sqrt.f64(double) nounwind readnone
+
+;CHECK-LABEL: @sin_f32(
+;CHECK: llvm.sin.v4f32
+;CHECK: ret void
+define void @sin_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.sin.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.sin.f32(float) nounwind readnone
+
+;CHECK-LABEL: @sin_f64(
+;CHECK: llvm.sin.v4f64
+;CHECK: ret void
+define void @sin_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.sin.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.sin.f64(double) nounwind readnone
+
+;CHECK-LABEL: @cos_f32(
+;CHECK: llvm.cos.v4f32
+;CHECK: ret void
+define void @cos_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.cos.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.cos.f32(float) nounwind readnone
+
+;CHECK-LABEL: @cos_f64(
+;CHECK: llvm.cos.v4f64
+;CHECK: ret void
+define void @cos_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.cos.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.cos.f64(double) nounwind readnone
+
+;CHECK-LABEL: @exp_f32(
+;CHECK: llvm.exp.v4f32
+;CHECK: ret void
+define void @exp_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.exp.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.exp.f32(float) nounwind readnone
+
+;CHECK-LABEL: @exp_f64(
+;CHECK: llvm.exp.v4f64
+;CHECK: ret void
+define void @exp_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.exp.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.exp.f64(double) nounwind readnone
+
+;CHECK-LABEL: @exp2_f32(
+;CHECK: llvm.exp2.v4f32
+;CHECK: ret void
+define void @exp2_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.exp2.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.exp2.f32(float) nounwind readnone
+
+;CHECK-LABEL: @exp2_f64(
+;CHECK: llvm.exp2.v4f64
+;CHECK: ret void
+define void @exp2_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.exp2.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.exp2.f64(double) nounwind readnone
+
+;CHECK-LABEL: @log_f32(
+;CHECK: llvm.log.v4f32
+;CHECK: ret void
+define void @log_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.log.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.log.f32(float) nounwind readnone
+
+;CHECK-LABEL: @log_f64(
+;CHECK: llvm.log.v4f64
+;CHECK: ret void
+define void @log_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.log.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.log.f64(double) nounwind readnone
+
+;CHECK-LABEL: @log10_f32(
+;CHECK: llvm.log10.v4f32
+;CHECK: ret void
+define void @log10_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.log10.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.log10.f32(float) nounwind readnone
+
+;CHECK-LABEL: @log10_f64(
+;CHECK: llvm.log10.v4f64
+;CHECK: ret void
+define void @log10_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.log10.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.log10.f64(double) nounwind readnone
+
+;CHECK-LABEL: @log2_f32(
+;CHECK: llvm.log2.v4f32
+;CHECK: ret void
+define void @log2_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.log2.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.log2.f32(float) nounwind readnone
+
+;CHECK-LABEL: @log2_f64(
+;CHECK: llvm.log2.v4f64
+;CHECK: ret void
+define void @log2_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.log2.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.log2.f64(double) nounwind readnone
+
+;CHECK-LABEL: @fabs_f32(
+;CHECK: llvm.fabs.v4f32
+;CHECK: ret void
+define void @fabs_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.fabs.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.fabs.f32(float) nounwind readnone
+
+;CHECK-LABEL: @fabs_f64(
+;CHECK: llvm.fabs.v4f64
+;CHECK: ret void
+define void @fabs_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.fabs.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.fabs.f64(double) nounwind readnone
+
+;CHECK-LABEL: @copysign_f32(
+;CHECK: llvm.copysign.v4f32
+;CHECK: ret void
+define void @copysign_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+ %1 = load float, float* %arrayidx1, align 4
+ %call = tail call float @llvm.copysign.f32(float %0, float %1) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.copysign.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @copysign_f64(
+;CHECK: llvm.copysign.v4f64
+;CHECK: ret void
+define void @copysign_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx1 = getelementptr inbounds double, double* %z, i64 %indvars.iv
+ %1 = load double, double* %arrayidx1, align 8
+ %call = tail call double @llvm.copysign.f64(double %0, double %1) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.copysign.f64(double, double) nounwind readnone
+
+;CHECK-LABEL: @floor_f32(
+;CHECK: llvm.floor.v4f32
+;CHECK: ret void
+define void @floor_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.floor.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.floor.f32(float) nounwind readnone
+
+;CHECK-LABEL: @floor_f64(
+;CHECK: llvm.floor.v4f64
+;CHECK: ret void
+define void @floor_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.floor.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.floor.f64(double) nounwind readnone
+
+;CHECK-LABEL: @ceil_f32(
+;CHECK: llvm.ceil.v4f32
+;CHECK: ret void
+define void @ceil_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.ceil.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.ceil.f32(float) nounwind readnone
+
+;CHECK-LABEL: @ceil_f64(
+;CHECK: llvm.ceil.v4f64
+;CHECK: ret void
+define void @ceil_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.ceil.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.ceil.f64(double) nounwind readnone
+
+;CHECK-LABEL: @trunc_f32(
+;CHECK: llvm.trunc.v4f32
+;CHECK: ret void
+define void @trunc_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.trunc.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.trunc.f32(float) nounwind readnone
+
+;CHECK-LABEL: @trunc_f64(
+;CHECK: llvm.trunc.v4f64
+;CHECK: ret void
+define void @trunc_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.trunc.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.trunc.f64(double) nounwind readnone
+
+;CHECK-LABEL: @rint_f32(
+;CHECK: llvm.rint.v4f32
+;CHECK: ret void
+define void @rint_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.rint.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.rint.f32(float) nounwind readnone
+
+;CHECK-LABEL: @rint_f64(
+;CHECK: llvm.rint.v4f64
+;CHECK: ret void
+define void @rint_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.rint.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.rint.f64(double) nounwind readnone
+
+;CHECK-LABEL: @nearbyint_f32(
+;CHECK: llvm.nearbyint.v4f32
+;CHECK: ret void
+define void @nearbyint_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.nearbyint.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.nearbyint.f32(float) nounwind readnone
+
+;CHECK-LABEL: @nearbyint_f64(
+;CHECK: llvm.nearbyint.v4f64
+;CHECK: ret void
+define void @nearbyint_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.nearbyint.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.nearbyint.f64(double) nounwind readnone
+
+;CHECK-LABEL: @round_f32(
+;CHECK: llvm.round.v4f32
+;CHECK: ret void
+define void @round_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @llvm.round.f32(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.round.f32(float) nounwind readnone
+
+;CHECK-LABEL: @round_f64(
+;CHECK: llvm.round.v4f64
+;CHECK: ret void
+define void @round_f64(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.round.f64(double %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx2, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.round.f64(double) nounwind readnone
+
+;CHECK-LABEL: @fma_f32(
+;CHECK: llvm.fma.v4f32
+;CHECK: ret void
+define void @fma_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z, float* noalias %w) nounwind uwtable {
+entry:
+ %cmp12 = icmp sgt i32 %n, 0
+ br i1 %cmp12, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %w, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+ %2 = load float, float* %arrayidx4, align 4
+ %3 = tail call float @llvm.fma.f32(float %0, float %2, float %1)
+ %arrayidx6 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %3, float* %arrayidx6, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+
+;CHECK-LABEL: @fma_f64(
+;CHECK: llvm.fma.v4f64
+;CHECK: ret void
+define void @fma_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z, double* noalias %w) nounwind uwtable {
+entry:
+ %cmp12 = icmp sgt i32 %n, 0
+ br i1 %cmp12, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %w, i64 %indvars.iv
+ %1 = load double, double* %arrayidx2, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %z, i64 %indvars.iv
+ %2 = load double, double* %arrayidx4, align 8
+ %3 = tail call double @llvm.fma.f64(double %0, double %2, double %1)
+ %arrayidx6 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %3, double* %arrayidx6, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.fma.f64(double, double, double) nounwind readnone
+
+;CHECK-LABEL: @fmuladd_f32(
+;CHECK: llvm.fmuladd.v4f32
+;CHECK: ret void
+define void @fmuladd_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z, float* noalias %w) nounwind uwtable {
+entry:
+ %cmp12 = icmp sgt i32 %n, 0
+ br i1 %cmp12, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %w, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+ %2 = load float, float* %arrayidx4, align 4
+ %3 = tail call float @llvm.fmuladd.f32(float %0, float %2, float %1)
+ %arrayidx6 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %3, float* %arrayidx6, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float) nounwind readnone
+
+;CHECK-LABEL: @fmuladd_f64(
+;CHECK: llvm.fmuladd.v4f64
+;CHECK: ret void
+define void @fmuladd_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z, double* noalias %w) nounwind uwtable {
+entry:
+ %cmp12 = icmp sgt i32 %n, 0
+ br i1 %cmp12, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %w, i64 %indvars.iv
+ %1 = load double, double* %arrayidx2, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %z, i64 %indvars.iv
+ %2 = load double, double* %arrayidx4, align 8
+ %3 = tail call double @llvm.fmuladd.f64(double %0, double %2, double %1)
+ %arrayidx6 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %3, double* %arrayidx6, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare double @llvm.fmuladd.f64(double, double, double) nounwind readnone
+
+;CHECK-LABEL: @pow_f32(
+;CHECK: llvm.pow.v4f32
+;CHECK: ret void
+define void @pow_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %call = tail call float @llvm.pow.f32(float %0, float %1) nounwind readnone
+ %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx4, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.pow.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @pow_f64(
+;CHECK: llvm.pow.v4f64
+;CHECK: ret void
+define void @pow_f64(i32 %n, double* noalias %y, double* noalias %x, double* noalias %z) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %z, i64 %indvars.iv
+ %1 = load double, double* %arrayidx2, align 8
+ %call = tail call double @llvm.pow.f64(double %0, double %1) nounwind readnone
+ %arrayidx4 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; CHECK-LABEL: @fabs_libm(
+; CHECK: call <4 x float> @llvm.fabs.v4f32
+; CHECK: ret void
+define void @fabs_libm(float* nocapture %x) nounwind {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @fabsf(float %0) nounwind readnone
+ store float %call, float* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+declare float @fabsf(float) nounwind readnone
+
+declare double @llvm.pow.f64(double, double) nounwind readnone
+
+; Make sure we don't replace calls to functions that have standard library
+; function signatures but are defined with internal linkage.
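+; The @roundf below shadows the libm function of the same name but has
+; internal linkage, so it must not be treated as the real library roundf.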
+
+define internal float @roundf(float %x) nounwind readnone {
+  ret float 0.000000e+00
+}
+; CHECK-LABEL: internal_round
+; CHECK-NOT: load <4 x float>
+
+define void @internal_round(float* nocapture %x) nounwind {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @roundf(float %0) nounwind readnone
+ store float %call, float* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; Make sure we don't replace calls to functions with standard library names but
+; different signatures.
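+; The libm round takes and returns double; the @round declared below returns
+; void, so the signature mismatch must block treating it as the library call.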
+
+declare void @round(double %f)
+
+; CHECK-LABEL: wrong_signature
+; CHECK-NOT: load <4 x double>
+
+define void @wrong_signature(double* nocapture %x) nounwind {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 4
+ store double %0, double* %arrayidx, align 4
+ tail call void @round(double %0) nounwind readnone
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+declare double @llvm.powi.f64(double %Val, i32 %power) nounwind readnone
+
+;CHECK-LABEL: @powi_f64(
+;CHECK: llvm.powi.v4f64
+;CHECK: ret void
+define void @powi_f64(i32 %n, double* noalias %y, double* noalias %x, i32 %P) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %call = tail call double @llvm.powi.f64(double %0, i32 %P) nounwind readnone
+ %arrayidx4 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @powi_f64_neg(
+;CHECK-NOT: llvm.powi.v4f64
+;CHECK: ret void
+define void @powi_f64_neg(i32 %n, double* noalias %y, double* noalias %x) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds double, double* %y, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 8
+ %1 = trunc i64 %indvars.iv to i32
+ %call = tail call double @llvm.powi.f64(double %0, i32 %1) nounwind readnone
+ %arrayidx4 = getelementptr inbounds double, double* %x, i64 %indvars.iv
+ store double %call, double* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
+
+;CHECK-LABEL: @cttz_i64(
+;CHECK: llvm.cttz.v4i64
+;CHECK: ret void
+define void @cttz_i64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64, i64* %y, i64 %indvars.iv
+ %0 = load i64, i64* %arrayidx, align 8
+ %call = tail call i64 @llvm.cttz.i64(i64 %0, i1 true) nounwind readnone
+ %arrayidx4 = getelementptr inbounds i64, i64* %x, i64 %indvars.iv
+ store i64 %call, i64* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+
+;CHECK-LABEL: @ctlz_i64(
+;CHECK: llvm.ctlz.v4i64
+;CHECK: ret void
+define void @ctlz_i64(i32 %n, i64* noalias %y, i64* noalias %x) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i64, i64* %y, i64 %indvars.iv
+ %0 = load i64, i64* %arrayidx, align 8
+ %call = tail call i64 @llvm.ctlz.i64(i64 %0, i1 true) nounwind readnone
+ %arrayidx4 = getelementptr inbounds i64, i64* %x, i64 %indvars.iv
+ store i64 %call, i64* %arrayidx4, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+
+define void @fshl_i32(i32 %n, i32* noalias %x, i32* noalias %y, i32 %shAmt) {
+; CHECK-LABEL: @fshl_i32(
+; CHECK: call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[WIDE_LOADX:%.*]], <4 x i32> [[WIDE_LOADY:%.*]], <4 x i32> [[SPLAT:%.*]])
+; CHECK: ret void
+entry:
+ %cmp = icmp sgt i32 %n, 0
+ br i1 %cmp, label %loop, label %end
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %xi = getelementptr inbounds i32, i32* %x, i32 %iv
+ %yi = getelementptr inbounds i32, i32* %y, i32 %iv
+ %xld = load i32, i32* %xi, align 4
+ %yld = load i32, i32* %yi, align 4
+ %call = tail call i32 @llvm.fshl.i32(i32 %xld, i32 %yld, i32 %shAmt)
+ store i32 %call, i32* %xi, align 4
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %end, label %loop
+
+end:
+ ret void
+}
+
+declare i32 @llvm.fshr.i32(i32, i32, i32)
+
+define void @fshr_i32(i32 %n, i32* noalias %x, i32* noalias %y, i32 %shAmt) {
+; CHECK-LABEL: @fshr_i32(
+; CHECK: call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[WIDE_LOADX:%.*]], <4 x i32> [[WIDE_LOADY:%.*]], <4 x i32> [[SPLAT:%.*]])
+; CHECK: ret void
+entry:
+ %cmp = icmp sgt i32 %n, 0
+ br i1 %cmp, label %loop, label %end
+
+loop:
+ %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+ %xi = getelementptr inbounds i32, i32* %x, i32 %iv
+ %yi = getelementptr inbounds i32, i32* %y, i32 %iv
+ %xld = load i32, i32* %xi, align 4
+ %yld = load i32, i32* %yi, align 4
+ %call = tail call i32 @llvm.fshr.i32(i32 %xld, i32 %yld, i32 %shAmt)
+ store i32 %call, i32* %xi, align 4
+ %iv.next = add i32 %iv, 1
+ %exitcond = icmp eq i32 %iv.next, %n
+ br i1 %exitcond, label %end, label %loop
+
+end:
+ ret void
+}
+
+declare float @llvm.minnum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @minnum_f32(
+;CHECK: llvm.minnum.v4f32
+;CHECK: ret void
+define void @minnum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %call = tail call float @llvm.minnum.f32(float %0, float %1) nounwind readnone
+ %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx4, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.maxnum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @maxnum_f32(
+;CHECK: llvm.maxnum.v4f32
+;CHECK: ret void
+define void @maxnum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %call = tail call float @llvm.maxnum.f32(float %0, float %1) nounwind readnone
+ %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx4, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.minimum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @minimum_f32(
+;CHECK: llvm.minimum.v4f32
+;CHECK: ret void
+define void @minimum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %call = tail call float @llvm.minimum.f32(float %0, float %1) nounwind readnone
+ %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx4, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare float @llvm.maximum.f32(float, float) nounwind readnone
+
+;CHECK-LABEL: @maximum_f32(
+;CHECK: llvm.maximum.v4f32
+;CHECK: ret void
+define void @maximum_f32(i32 %n, float* noalias %y, float* noalias %x, float* noalias %z) nounwind uwtable {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %z, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %call = tail call float @llvm.maximum.f32(float %0, float %1) nounwind readnone
+ %arrayidx4 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx4, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
new file mode 100644
index 00000000000..cc4c55c3f44
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -0,0 +1,593 @@
+; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+; The first LICM pass is meant to hoist/sink invariant stores if possible. Today
+; LICM does not hoist/sink these invariant stores; even if that changes, we
+; should still vectorize this loop in case LICM is not run.
+
+; The second LICM pass, after vectorization, hoists/sinks loop-invariant
+; instructions.
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; All tests check that it is legal to vectorize stores to an invariant
+; address.
+
+
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
+; The runtime memory check is: found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
+; CHECK: vector.memcheck:
+; CHECK: found.conflict
+
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK: %wide.load = load <4 x i32>
+; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
+
+; CHECK-LABEL: middle.block:
+; CHECK: %rdx.shuf = shufflevector <4 x i32>
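+; A rough C equivalent of the loop below (an illustrative sketch, not part of
+; the original test):
+;   int sum = 0;
+;   for (long i = 0; i < n; i++) { sum += b[i]; *a = (int)n; }
+;   return sum;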
+define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %tmp3 = add i32 %tmp0, %tmp2
+ store i32 %ntrunc, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %tmp4 = phi i32 [ %tmp3, %for.body ]
+ ret i32 %tmp4
+}
+
+; CHECK-LABEL: inv_val_store_to_inv_address(
+; CHECK-LABEL: vector.body:
+; CHECK: store i32 %ntrunc, i32* %a
+; CHECK: store <4 x i32>
+; CHECK-NEXT: %index.next = add i64 %index, 4
+; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
+; CHECK-NEXT: br i1
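+; Roughly (illustrative C sketch):
+;   for (long i = 0; i < n; i++) { *a = (int)n; b[i] = (int)n; }
+; (the load of b[i] in the IR is dead).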
+define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ store i32 %ntrunc, i32* %a
+ store i32 %ntrunc, i32* %tmp1
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+; Both tests below are handled as predicated stores.
+
+; Conditional store:
+; if (b[i] == k) a = ntrunc
+; TODO: The code gen for the first test could be improved: a single scalar
+; store suffices if vector.or.reduce(vector_cmp(b[i] == k)) is 1.
+
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional(
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
+; CHECK: store <4 x i32>
+; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
+; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue
+
+; CHECK-LABEL: pred.store.if:
+; CHECK-NEXT: store i32 %ntrunc, i32* %a
+; CHECK-NEXT: br label %pred.store.continue
+
+; CHECK-LABEL: pred.store.continue:
+; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1
+define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %latch
+
+cond_store:
+ store i32 %ntrunc, i32* %a
+ br label %latch
+
+latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+ ret void
+}
+
+; if (b[i] == k)
+; a = ntrunc
+; else a = k;
+; TODO: We could vectorize this once we support multiple uniform stores to the
+; same address.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values(
+; CHECK-NOT: load <4 x i32>
+define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+ store i32 %ntrunc, i32* %a
+ br label %latch
+
+cond_store_k:
+  store i32 %k, i32* %a
+ br label %latch
+
+latch:
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+ ret void
+}
+
+; InstCombine'd version of the test above. The store is no longer of an
+; invariant value: the code gen scalar-stores the value extracted from the
+; last element of the vector.
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x i32> [[BROADCAST_SPLAT8]], <4 x i32> [[BROADCAST_SPLAT6]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[A]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]]
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK: cond_store:
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: cond_store_k:
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ %cmp = icmp eq i32 %tmp2, %k
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+ br label %latch
+
+cond_store_k:
+ br label %latch
+
+latch:
+ %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+ store i32 %storeval, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+ ret void
+}
+
+; Invariant value stored to an invariant address, predicated on an invariant
+; condition. This is not treated as a predicated store, since the block the
+; store belongs to is the latch block (which doesn't need to be predicated).
+; Variant/invariant values are stored to the invariant address. The test checks
+; that the last element of the phi is extracted and scalar-stored into the
+; uniform address within the loop. Since the condition and the phi are loop
+; invariant, they are LICM'ed after vectorization.
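+; Roughly (illustrative sketch):
+;   for (long i = 0; i < n; i++) { b[i] = (int)n; *a = ((int)n == k) ? (int)n : k; }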
+; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> undef, i32 [[NTRUNC]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[CMP]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[K]], i32 3
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP5]], i32* [[A]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[COND_STORE_K:%.*]]
+; CHECK: cond_store:
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: cond_store_k:
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[STOREVAL:%.*]] = phi i32 [ [[NTRUNC]], [[COND_STORE]] ], [ [[K]], [[COND_STORE_K]] ]
+; CHECK-NEXT: store i32 [[STOREVAL]], i32* [[A]], align 4
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ %cmp = icmp eq i32 %ntrunc, %k
+ br label %for.body
+
+for.body:                                         ; preds = %latch, %entry
+ %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ store i32 %ntrunc, i32* %tmp1
+ br i1 %cmp, label %cond_store, label %cond_store_k
+
+cond_store:
+ br label %latch
+
+cond_store_k:
+ br label %latch
+
+latch:
+ %storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
+ store i32 %storeval, i32* %a
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %latch
+ ret void
+}
+
+; A variant value stored to a uniform address: the test checks that the code
+; gen extracts the last element from the variant vector and scalar-stores it
+; into the uniform address.
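+; Roughly (illustrative sketch):
+;   int sum = 0;
+;   for (long i = 0; i < n; i++) { *a = b[i]; sum += b[i]; }
+;   return sum;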
+; CHECK-LABEL: variant_val_store_to_inv_address
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
+; CHECK-NEXT: [[SMAX3:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX3]]
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B2]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4
+; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[DOTLCSSA]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[DOTLCSSA]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[TMP3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ]
+; CHECK-NEXT: br label [[FOR_END]]
+define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
+entry:
+ %ntrunc = trunc i64 %n to i32
+ %cmp = icmp eq i32 %ntrunc, %k
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
+ %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp2 = load i32, i32* %tmp1, align 8
+ store i32 %tmp2, i32* %a
+ %tmp3 = add i32 %tmp0, %tmp2
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %rdx.lcssa = phi i32 [ %tmp3, %for.body ]
+ ret i32 %rdx.lcssa
+}
+
+; Multiple variant stores to the same uniform address
+; We do not vectorize such loops currently.
+; for(; i < itr; i++) {
+; for(; j < itr; j++) {
+; var1[i] = var2[j] + var1[i];
+; var1[i]++;
+; }
+; }
+
+; CHECK-LABEL: multiple_uniform_stores
+; CHECK-NOT: <4 x i32>
+define i32 @multiple_uniform_stores(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+entry:
+ %cmp20 = icmp eq i32 %itr, 0
+ br i1 %cmp20, label %for.end10, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
+ %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
+ %cmp218 = icmp ult i32 %j.022, %itr
+ br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
+
+for.body3.lr.ph: ; preds = %for.cond1.preheader
+ %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+ %0 = zext i32 %j.022 to i64
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.body3.lr.ph
+ %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx, align 4
+ %2 = load i32, i32* %arrayidx5, align 4
+ %add = add nsw i32 %2, %1
+ store i32 %add, i32* %arrayidx5, align 4
+ %3 = load i32, i32* %arrayidx5, align 4
+ %4 = add nsw i32 %3, 1
+ store i32 %4, i32* %arrayidx5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %itr
+ br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8: ; preds = %for.body3, %for.cond1.preheader
+ %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %for.body3 ]
+ %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+ %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
+ %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
+ br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
+
+for.end10: ; preds = %for.inc8, %entry
+ ret i32 undef
+}
+
+; The second uniform store to the same address is conditional; we do not
+; vectorize this.
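+; Roughly, the inner loop body is (illustrative sketch; the compare is
+; unsigned):
+;   var1[i] = var2[j] + var1[i];
+;   if (var1[i] > 42) var1[i] = var1[i] + 1;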
+; CHECK-LABEL: multiple_uniform_stores_conditional
+; CHECK-NOT: <4 x i32>
+define i32 @multiple_uniform_stores_conditional(i32* nocapture %var1, i32* nocapture readonly %var2, i32 %itr) #0 {
+entry:
+ %cmp20 = icmp eq i32 %itr, 0
+ br i1 %cmp20, label %for.end10, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %indvars.iv23 = phi i64 [ %indvars.iv.next24, %for.inc8 ], [ 0, %entry ]
+ %j.022 = phi i32 [ %j.1.lcssa, %for.inc8 ], [ 0, %entry ]
+ %cmp218 = icmp ult i32 %j.022, %itr
+ br i1 %cmp218, label %for.body3.lr.ph, label %for.inc8
+
+for.body3.lr.ph: ; preds = %for.cond1.preheader
+ %arrayidx5 = getelementptr inbounds i32, i32* %var1, i64 %indvars.iv23
+ %0 = zext i32 %j.022 to i64
+ br label %for.body3
+
+for.body3:                                        ; preds = %latch, %for.body3.lr.ph
+ %indvars.iv = phi i64 [ %0, %for.body3.lr.ph ], [ %indvars.iv.next, %latch ]
+ %arrayidx = getelementptr inbounds i32, i32* %var2, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx, align 4
+ %2 = load i32, i32* %arrayidx5, align 4
+ %add = add nsw i32 %2, %1
+ store i32 %add, i32* %arrayidx5, align 4
+ %3 = load i32, i32* %arrayidx5, align 4
+ %4 = add nsw i32 %3, 1
+ %5 = icmp ugt i32 %3, 42
+ br i1 %5, label %cond_store, label %latch
+
+cond_store:
+ store i32 %4, i32* %arrayidx5, align 4
+ br label %latch
+
+latch:
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %itr
+ br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8:                                         ; preds = %latch, %for.cond1.preheader
+ %j.1.lcssa = phi i32 [ %j.022, %for.cond1.preheader ], [ %itr, %latch ]
+ %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+ %lftr.wideiv25 = trunc i64 %indvars.iv.next24 to i32
+ %exitcond26 = icmp eq i32 %lftr.wideiv25, %itr
+ br i1 %exitcond26, label %for.end10, label %for.cond1.preheader
+
+for.end10: ; preds = %for.inc8, %entry
+ ret i32 undef
+}
+
+; We cannot vectorize a loop with an unsafe dependency between a uniform load
+; (%tmp10) and a store (%tmp12) to the same address.
+; PR39653
+; Note: %tmp10 could be replaced by phi(%arg4, %tmp12), a potentially
+; vectorizable first-order recurrence.
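+; Roughly (illustrative sketch), the alloca %tmp carries a scalar recurrence:
+;   s = arg4;  then each iteration: s = (x * s) % 65536;
+; the load and store of s hit the same address every iteration, which is the
+; unsafe cross-iteration dependency.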
+define void @unsafe_dep_uniform_load_store(i32 %arg, i32 %arg1, i64 %arg2, i16* %arg3, i32 %arg4, i64 %arg5) {
+; CHECK-LABEL: unsafe_dep_uniform_load_store
+; CHECK-NOT: <4 x i32>
+bb:
+ %tmp = alloca i32
+ store i32 %arg4, i32* %tmp
+ %tmp6 = getelementptr inbounds i16, i16* %arg3, i64 %arg5
+ br label %bb7
+
+bb7:
+ %tmp8 = phi i64 [ 0, %bb ], [ %tmp24, %bb7 ]
+ %tmp9 = phi i32 [ %arg1, %bb ], [ %tmp23, %bb7 ]
+ %tmp10 = load i32, i32* %tmp
+ %tmp11 = mul nsw i32 %tmp9, %tmp10
+ %tmp12 = srem i32 %tmp11, 65536
+ %tmp13 = add nsw i32 %tmp12, %tmp9
+ %tmp14 = trunc i32 %tmp13 to i16
+ %tmp15 = trunc i64 %tmp8 to i32
+ %tmp16 = add i32 %arg, %tmp15
+ %tmp17 = zext i32 %tmp16 to i64
+ %tmp18 = getelementptr inbounds i16, i16* %tmp6, i64 %tmp17
+ store i16 %tmp14, i16* %tmp18, align 2
+ %tmp19 = add i32 %tmp13, %tmp9
+ %tmp20 = trunc i32 %tmp19 to i16
+ %tmp21 = and i16 %tmp20, 255
+ %tmp22 = getelementptr inbounds i16, i16* %arg3, i64 %tmp17
+ store i16 %tmp21, i16* %tmp22, align 2
+ %tmp23 = add nsw i32 %tmp9, 1
+ %tmp24 = add nuw nsw i64 %tmp8, 1
+ %tmp25 = icmp eq i64 %tmp24, %arg2
+ store i32 %tmp12, i32* %tmp
+ br i1 %tmp25, label %bb26, label %bb7
+
+bb26:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
new file mode 100644
index 00000000000..e4385940e75
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -0,0 +1,176 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck %s
+
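+; These tests exercise induction variables with uses outside the loop: when
+; the vector loop exits, the escaped IV value has to be recomputed in
+; middle.block (e.g. from %n.vec) and fed into the exit phi.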
+; CHECK-LABEL: @postinc
+; CHECK-LABEL: scalar.ph:
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
+; CHECK: ret i32 %[[RET]]
+define i32 @postinc(i32 %k) {
+entry:
+ br label %for.body
+
+for.body:
+ %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %inc = add nsw i32 %inc.phi, 1
+ %cmp = icmp eq i32 %inc, %k
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret i32 %inc
+}
+
+; CHECK-LABEL: @preinc
+; CHECK-LABEL: middle.block:
+; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1
+; CHECK-LABEL: scalar.ph:
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %[[v3]], %middle.block ]
+; CHECK: ret i32 %[[RET]]
+define i32 @preinc(i32 %k) {
+entry:
+ br label %for.body
+
+for.body:
+ %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %inc = add nsw i32 %inc.phi, 1
+ %cmp = icmp eq i32 %inc, %k
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret i32 %inc.phi
+}
+
+; CHECK-LABEL: @constpre
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ 2, %middle.block ]
+; CHECK: ret i32 %[[RET]]
+define i32 @constpre() {
+entry:
+ br label %for.body
+
+for.body:
+ %inc.phi = phi i32 [ 32, %entry ], [ %inc, %for.body ]
+ %inc = sub nsw i32 %inc.phi, 2
+ %cmp = icmp eq i32 %inc, 0
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret i32 %inc.phi
+}
+
+; CHECK-LABEL: @geppre
+; CHECK-LABEL: middle.block:
+; CHECK: %ind.escape = getelementptr i32, i32* %ptr, i64 124
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32* [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ]
+; CHECK: ret i32* %[[RET]]
+define i32* @geppre(i32* %ptr) {
+entry:
+ br label %for.body
+
+for.body:
+ %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %ptr.phi = phi i32* [ %ptr, %entry ], [ %inc.ptr, %for.body ]
+ %inc = add nsw i32 %inc.phi, 1
+ %inc.ptr = getelementptr i32, i32* %ptr.phi, i32 4
+ %cmp = icmp eq i32 %inc, 32
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret i32* %ptr.phi
+}
+
+; CHECK-LABEL: @both
+; CHECK-LABEL: middle.block:
+; CHECK: %[[END:.*]] = sub i64 %n.vec, 1
+; CHECK: %ind.escape = getelementptr i32, i32* %base, i64 %[[END]]
+; CHECK-LABEL: for.end:
+; CHECK: %[[RET:.*]] = phi i32* [ %inc.lag1, %for.body ], [ %ind.escape, %middle.block ]
+; CHECK: ret i32* %[[RET]]
+
+define i32* @both(i32 %k) {
+entry:
+ %base = getelementptr inbounds i32, i32* undef, i64 1
+ br label %for.body
+
+for.body:
+ %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %inc.lag1 = phi i32* [ %base, %entry ], [ %tmp, %for.body]
+ %inc.lag2 = phi i32* [ undef, %entry ], [ %inc.lag1, %for.body]
+ %tmp = getelementptr inbounds i32, i32* %inc.lag1, i64 1
+ %inc = add nsw i32 %inc.phi, 1
+ %cmp = icmp eq i32 %inc, %k
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret i32* %inc.lag1
+}
+
+; CHECK-LABEL: @multiphi
+; CHECK-LABEL: scalar.ph:
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK-LABEL: for.end:
+; CHECK: %phi = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
+; CHECK: %phi2 = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
+; CHECK: store i32 %phi2, i32* %p
+; CHECK: ret i32 %phi
+define i32 @multiphi(i32 %k, i32* %p) {
+entry:
+ br label %for.body
+
+for.body:
+ %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %inc = add nsw i32 %inc.phi, 1
+ %cmp = icmp eq i32 %inc, %k
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ %phi = phi i32 [ %inc, %for.body ]
+ %phi2 = phi i32 [ %inc, %for.body ]
+ store i32 %phi2, i32* %p
+ ret i32 %phi
+}
+
+; CHECK-LABEL: @PR30742
+; CHECK: %[[T15:.+]] = add nsw i32 %tmp03, -7
+; CHECK: vector.ph
+; CHECK: %[[N_MOD_VF:.+]] = urem i32 %[[T5:.+]], 2
+; CHECK: %[[N_VEC:.+]] = sub i32 %[[T5]], %[[N_MOD_VF]]
+; CHECK: middle.block
+; CHECK: %[[CMP:.+]] = icmp eq i32 %[[T5]], %[[N_VEC]]
+; CHECK: %ind.escape = add i32 %[[T15]],
+; CHECK: br i1 %[[CMP]], label %BB3, label %scalar.ph
+define void @PR30742() {
+BB0:
+ br label %BB1
+
+BB1:
+ %tmp00 = load i32, i32* undef, align 16
+ %tmp01 = sub i32 %tmp00, undef
+ %tmp02 = icmp slt i32 %tmp01, 1
+ %tmp03 = select i1 %tmp02, i32 1, i32 %tmp01
+ %tmp04 = add nsw i32 %tmp03, -7
+ br label %BB2
+
+BB2:
+ %tmp05 = phi i32 [ %tmp04, %BB1 ], [ %tmp06, %BB2 ]
+ %tmp06 = add i32 %tmp05, -8
+ %tmp07 = icmp sgt i32 %tmp06, 0
+ br i1 %tmp07, label %BB2, label %BB3
+
+BB3:
+ %tmp08 = phi i32 [ %tmp05, %BB2 ]
+ %tmp09 = sub i32 %tmp00, undef
+ %tmp10 = icmp slt i32 %tmp09, 1
+ %tmp11 = select i1 %tmp10, i32 1, i32 %tmp09
+ %tmp12 = add nsw i32 %tmp11, -7
+ br label %BB4
+
+BB4:
+ %tmp13 = phi i32 [ %tmp12, %BB3 ], [ %tmp14, %BB4 ]
+ %tmp14 = add i32 %tmp13, -8
+ %tmp15 = icmp sgt i32 %tmp14, 0
+ br i1 %tmp15, label %BB4, label %BB1
+}
diff --git a/llvm/test/Transforms/LoopVectorize/lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/lcssa-crash.ll
new file mode 100644
index 00000000000..3d3ef9e0593
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/lcssa-crash.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%type1 = type { %type2 }
+%type2 = type { [0 x i8*], i8**, i32, i32, i32 }
+
+define void @test() nounwind uwtable align 2 {
+ br label %for.body.lr.ph.i.i.i
+
+for.body.lr.ph.i.i.i:
+ br label %for.body.i.i.i
+
+for.body.i.i.i:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc.i.i.i ], [ 0, %for.body.lr.ph.i.i.i ]
+ br label %for.inc.i.i.i
+
+for.inc.i.i.i:
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, undef
+ br i1 %exitcond, label %for.body.i.i.i, label %for.end.i.i.i
+
+for.end.i.i.i:
+ %lcssa = phi %type1* [ undef, %for.inc.i.i.i ]
+ unreachable
+}
+
+; PR16139
+define void @test2(i8* %x) {
+entry:
+ indirectbr i8* %x, [ label %L0, label %L1 ]
+
+L0:
+ br label %L0
+
+L1:
+ ret void
+}
+
+; This loop has different uniform instructions before and after LCSSA.
+define void @test3() {
+entry:
+ %add41 = add i32 undef, undef
+ %idxprom4736 = zext i32 %add41 to i64
+ br label %while.body
+
+while.body:
+ %idxprom4738 = phi i64 [ %idxprom47, %while.body ], [ %idxprom4736, %entry ]
+ %pos.337 = phi i32 [ %inc46, %while.body ], [ %add41, %entry ]
+ %inc46 = add i32 %pos.337, 1
+ %arrayidx48 = getelementptr inbounds [1024 x i8], [1024 x i8]* undef, i64 0, i64 %idxprom4738
+ store i8 0, i8* %arrayidx48, align 1
+ %and43 = and i32 %inc46, 3
+ %cmp44 = icmp eq i32 %and43, 0
+ %idxprom47 = zext i32 %inc46 to i64
+ br i1 %cmp44, label %while.end, label %while.body
+
+while.end:
+ %add58 = add i32 %inc46, 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/legal_preheader_check.ll b/llvm/test/Transforms/LoopVectorize/legal_preheader_check.ll
new file mode 100644
index 00000000000..32aa796394d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/legal_preheader_check.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -debug -S -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; D40973
+; Make sure LV's legality analysis bails out when the loop doesn't have a
+; legal pre-header.
+
+; CHECK: LV: Loop doesn't have a legal pre-header.
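+; The loop header .lr.ph below is reached via an indirectbr edge, which cannot
+; be split to create a dedicated pre-header, so the legality check bails out.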
+
+define void @inc(i32 %n, i8* %P) {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %BB1, label %BB2
+
+BB1:
+ indirectbr i8* %P, [label %.lr.ph]
+
+BB2:
+ br label %.lr.ph
+
+.lr.ph:
+ %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %BB1 ], [ 0, %BB2 ]
+ %indvars.iv.next = add i32 %indvars.iv, 1
+ %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/libcall-remark.ll b/llvm/test/Transforms/LoopVectorize/libcall-remark.ll
new file mode 100644
index 00000000000..a1a7e461c70
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/libcall-remark.ll
@@ -0,0 +1,52 @@
+; RUN: opt -S -loop-vectorize < %s 2>&1 -pass-remarks-analysis=.* | FileCheck %s
+
+; Test the optimization remark emitter for recognition
+; of a mathlib function vs. an arbitrary function.
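+; sqrtf is recognized as a math library function (presumably via
+; TargetLibraryInfo), hence the "library call" remark; @arbitrary is unknown,
+; hence the generic "call instruction" remark.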
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.14.0"
+@data = external local_unnamed_addr global [32768 x float], align 16
+
+; CHECK: loop not vectorized: library call cannot be vectorized
+
+define void @libcall_blocks_vectorization() {
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [32768 x float], [32768 x float]* @data, i64 0, i64 %indvars.iv
+ %t0 = load float, float* %arrayidx, align 4
+ %sqrtf = tail call float @sqrtf(float %t0)
+ store float %sqrtf, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 32768
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK: loop not vectorized: call instruction cannot be vectorized
+
+define void @arbitrary_call_blocks_vectorization() {
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [32768 x float], [32768 x float]* @data, i64 0, i64 %indvars.iv
+ %t0 = load float, float* %arrayidx, align 4
+ %sqrtf = tail call float @arbitrary(float %t0)
+ store float %sqrtf, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 32768
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare float @sqrtf(float)
+declare float @arbitrary(float)
+
diff --git a/llvm/test/Transforms/LoopVectorize/lifetime.ll b/llvm/test/Transforms/LoopVectorize/lifetime.ll
new file mode 100644
index 00000000000..860fe2d983c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/lifetime.ll
@@ -0,0 +1,96 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can vectorize loops which contain lifetime markers.
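+; The lifetime intrinsics themselves stay as scalar calls (they take the same
+; pointer on every iteration) while the store is widened, as the CHECK lines
+; below verify.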
+
+; CHECK-LABEL: @test(
+; CHECK: call void @llvm.lifetime.end
+; CHECK: store <2 x i32>
+; CHECK: call void @llvm.lifetime.start
+
+define void @test(i32 *%d) {
+entry:
+ %arr = alloca [1024 x i32], align 16
+ %0 = bitcast [1024 x i32]* %arr to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+ %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx, align 8
+ store i32 100, i32* %arrayidx, align 8
+ call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+ ret void
+}
+
+; CHECK-LABEL: @testbitcast(
+; CHECK: call void @llvm.lifetime.end
+; CHECK: store <2 x i32>
+; CHECK: call void @llvm.lifetime.start
+
+define void @testbitcast(i32 *%d) {
+entry:
+ %arr = alloca [1024 x i32], align 16
+ %0 = bitcast [1024 x i32]* %arr to i8*
+ call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %1 = bitcast [1024 x i32]* %arr to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
+ %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+ %2 = load i32, i32* %arrayidx, align 8
+ store i32 100, i32* %arrayidx, align 8
+ call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+ ret void
+}
+
+; CHECK-LABEL: @testloopvariant(
+; CHECK: call void @llvm.lifetime.end
+; CHECK: store <2 x i32>
+; CHECK: call void @llvm.lifetime.start
+
+define void @testloopvariant(i32 *%d) {
+entry:
+ %arr = alloca [1024 x i32], align 16
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv
+ %1 = bitcast [1024 x i32]* %arr to i8*
+ call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
+ %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+ %2 = load i32, i32* %arrayidx, align 8
+ store i32 100, i32* %arrayidx, align 8
+ call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll
new file mode 100644
index 00000000000..3bbe8100e34
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that we vectorize only bottom-tested loops.
+; This is a reduced testcase from PR21302.
+;
+; rdar://problem/18886083
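+; Roughly (illustrative; the base pointer is undef in the IR):
+;   for (i = 0; i < n; i++) base[i].b = 0;
+; with the exit test in the loop header (for.cond), i.e. a top-tested loop,
+; so LV bails.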
+
+%struct.X = type { i32, i16 }
+; CHECK-LABEL: @foo(
+; CHECK-NOT: vector.body
+
+define void @foo(i32 %n) {
+entry:
+ br label %for.cond
+
+for.cond:
+ %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %i, %n
+ br i1 %cmp, label %for.body, label %if.end
+
+for.body:
+ %iprom = sext i32 %i to i64
+ %b = getelementptr inbounds %struct.X, %struct.X* undef, i64 %iprom, i32 1
+ store i16 0, i16* %b, align 4
+ %inc = add nsw i32 %i, 1
+ br label %for.cond
+
+if.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/loop-scalars.ll b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
new file mode 100644
index 00000000000..4dcd5993c12
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/loop-scalars.ll
@@ -0,0 +1,143 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: vector_gep
+; CHECK-NOT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <2 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>*
+; CHECK-NEXT: store <2 x i32*> [[TMP1]], <2 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @vector_gep(i32** %a, i32 *%b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+ store i32* %tmp0, i32** %tmp1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: scalar_store
+; CHECK: LV: Found scalar instruction: %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT: store i32* [[TMP5]], i32** [[TMP7]], align 8
+; CHECK-NEXT: store i32* [[TMP6]], i32** [[TMP8]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @scalar_store(i32** %a, i32 *%b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+ store i32* %tmp0, i32** %tmp1, align 8
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: expansion
+; CHECK: LV: Found scalar instruction: %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
+; CHECK-NEXT: LV: Found scalar instruction: %tmp1 = bitcast i64* %tmp0 to i32*
+; CHECK-NEXT: LV: Found scalar instruction: %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
+; CHECK-NEXT: LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
+; CHECK-NEXT: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32** [[TMP7]] to i64**
+; CHECK-NEXT: store i64* [[TMP5]], i64** [[TMP9]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32** [[TMP8]] to i64**
+; CHECK-NEXT: store i64* [[TMP6]], i64** [[TMP10]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @expansion(i32** %a, i64 *%b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
+ %tmp1 = bitcast i64* %tmp0 to i32*
+ %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
+ %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
+ store i32* %tmp1, i32** %tmp3, align 8
+ %i.next = add nuw nsw i64 %i, 2
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: no_gep_or_bitcast
+; CHECK-NOT: LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8
+; CHECK: LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: store i32 0, i32* [[TMP4]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32*, i32** %a, i64 %i
+ %tmp1 = load i32*, i32** %tmp0, align 8
+ store i32 0, i32* %tmp1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll b/llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll
new file mode 100644
index 00000000000..d9efaa5c085
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/loop-vect-memdep.ll
@@ -0,0 +1,26 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; RUN: opt < %s -S -loop-vectorize -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; REQUIRES: asserts
+; CHECK: LV: Can't vectorize due to memory conflicts
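+;
+; The loop below swaps adjacent elements through the same base pointer; as a
+; C sketch (illustrative only, not taken from the original source):
+;
+;   for (i = 0; i != n + 1; ++i) {
+;     double t1 = t[i], t2 = t[i + 1];
+;     t[i + 1] = t1;
+;     t[i] = t2;
+;   }
+;
+; The store to t[i+1] in one iteration conflicts with the load of t[i] in the
+; next (a distance-1 read-after-write dependence), so the vectorizer rejects
+; the loop with the message checked above.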
+
+define void @test_loop_novect(double** %arr, i64 %n) {
+for.body.lr.ph:
+ %t = load double*, double** %arr, align 8
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %i = phi i64 [ 0, %for.body.lr.ph ], [ %i.next, %for.body ]
+ %a = getelementptr inbounds double, double* %t, i64 %i
+ %i.next = add nuw nsw i64 %i, 1
+ %a.next = getelementptr inbounds double, double* %t, i64 %i.next
+ %t1 = load double, double* %a, align 8
+ %t2 = load double, double* %a.next, align 8
+ store double %t1, double* %a.next, align 8
+ store double %t2, double* %a, align 8
+ %c = icmp eq i64 %i, %n
+ br i1 %c, label %final, label %for.body
+
+final: ; preds = %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/memdep.ll b/llvm/test/Transforms/LoopVectorize/memdep.ll
new file mode 100644
index 00000000000..bea9cc39ab4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/memdep.ll
@@ -0,0 +1,273 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s -check-prefix=WIDTH
+; RUN: opt -S -loop-vectorize -force-vector-width=4 < %s | FileCheck %s -check-prefix=RIGHTVF
+; RUN: opt -S -loop-vectorize -force-vector-width=8 < %s | FileCheck %s -check-prefix=WRONGVF
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Vectorization with dependence checks.
+
+; No plausible dependence - can be vectorized.
+; for (i = 0; i < 1024; ++i)
+; A[i] = A[i + 1] + 1;
+
+; CHECK-LABEL: @f1_vec(
+; CHECK: <2 x i32>
+
+define void @f1_vec(i32* %A) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %indvars.iv.next = add i32 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv.next
+ %0 = load i32, i32* %arrayidx, align 4
+ %add1 = add nsw i32 %0, 1
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+ store i32 %add1, i32* %arrayidx3, align 4
+ %exitcond = icmp ne i32 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Plausible dependence of distance 1 - can't be vectorized.
+; for (i = 0; i < 1024; ++i)
+; A[i+1] = A[i] + 1;
+
+; CHECK-LABEL: @f2_novec(
+; CHECK-NOT: <2 x i32>
+
+define void @f2_novec(i32* %A) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, 1
+ %indvars.iv.next = add i32 %indvars.iv, 1
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv.next
+ store i32 %add, i32* %arrayidx3, align 4
+ %exitcond = icmp ne i32 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Plausible dependence of distance 2 - can be vectorized with a width of 2.
+; for (i = 0; i < 1024; ++i)
+; A[i+2] = A[i] + 1;
+
+; CHECK-LABEL: @f3_vec_len(
+; CHECK: <2 x i32>
+
+; WIDTH: f3_vec_len
+; WIDTH-NOT: <4 x i32>
+
+define void @f3_vec_len(i32* %A) {
+entry:
+ br label %for.body
+
+for.body:
+ %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %idxprom = sext i32 %i.01 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, 1
+ %add1 = add nsw i32 %i.01, 2
+ %idxprom2 = sext i32 %add1 to i64
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %idxprom2
+ store i32 %add, i32* %arrayidx3, align 4
+ %inc = add nsw i32 %i.01, 1
+ %cmp = icmp slt i32 %inc, 1024
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Plausible dependence of distance 1 - cannot be vectorized (without reordering
+; accesses).
+; for (i = 0; i < 1024; ++i) {
+; B[i] = A[i];
+; A[i] = B[i + 1];
+; }
+
+; CHECK-LABEL: @f5(
+; CHECK-NOT: <2 x i32>
+
+define void @f5(i32* %A, i32* %B) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ store i32 %0, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nsw i64 %indvars.iv, 1
+ %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv.next
+ %1 = load i32, i32* %arrayidx4, align 4
+ store i32 %1, i32* %arrayidx, align 4
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Dependence through a phi node - must not vectorize.
+; for (i = 0; i < 1024; ++i) {
+; a[i+1] = tmp;
+; tmp = a[i];
+; }
+
+; CHECK-LABEL: @f6
+; CHECK-NOT: <2 x i32>
+
+define i32 @f6(i32* %a, i32 %tmp) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]
+ %indvars.iv.next = add nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+ store i32 %tmp.addr.08, i32* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx3, align 4
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret i32 undef
+}
+
+; Don't vectorize loops with a true loop-carried dependence whose distance is
+; not a multiple of the vector width.
+; Example:
+; for (int i = ...; ++i) {
+; a[i] = a[i-3] + ...;
+; It is a bad idea to vectorize this loop because store-load forwarding will not
+; happen.
+;
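+; Sketch of the hazard (illustrative, assuming typical store-to-load
+; forwarding hardware): with VF = 2, the vector iteration for {i, i+1} stores
+; a[i..i+1] and loads a[i-3..i-2]; because the distance 3 is not a multiple
+; of the VF, each such vector load straddles two earlier 2-wide stores, so
+; the CPU cannot forward the stored data and the load stalls until the
+; stores drain to the cache.
+;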
+
+; CHECK-LABEL: @nostoreloadforward(
+; CHECK-NOT: <2 x i32>
+
+define void @nostoreloadforward(i32* %A) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = add nsw i64 %indvars.iv, -3
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %2 = add nsw i64 %indvars.iv, 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %2
+ %3 = load i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %3, %1
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %add3, i32* %arrayidx5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+; Example:
+; for (int i = ...; ++i) {
+; a[i] = b[i];
+; c[i] = a[i-3] + ...;
+; It is a bad idea to vectorize this loop because store-load forwarding will not
+; happen.
+;
+
+; CHECK-LABEL: @nostoreloadforward2(
+; CHECK-NOT: <2 x i32>
+
+define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %0, i32* %arrayidx2, align 4
+ %1 = add nsw i64 %indvars.iv, -3
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %1
+ %2 = load i32, i32* %arrayidx4, align 4
+ %arrayidx6 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ store i32 %2, i32* %arrayidx6, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
+
+; Check the new calculation of the maximum safe dependence distance, in bits,
+; that can be vectorized. The previous behavior did not take into account that
+; the stride is 2, so maxVF was computed as 8 instead of 4; the dependence
+; distance here is 6 iterations, given by |N - (N - 12)| / 2.
+
+; #define M 32
+; #define N 2 * M
+; unsigned int a[N];
+; void pr34283() {
+;   unsigned int j = 0;
+;   for (j = 0; j < M - 6; ++j)
+;   {
+;     a[N - 2 * j] = 69;
+;     a[N - 12 - 2 * j] = 7;
+;   }
+; }
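+;
+; Worked out (a sketch of the calculation, under the assumptions above):
+; iteration j stores to a[N - 2j] and a[N - 12 - 2j]. The first store of
+; iteration j + 6 hits a[N - 2(j + 6)] = a[N - 12 - 2j], the location the
+; second store wrote in iteration j, so the dependence distance is
+; |N - (N - 12)| / stride = 12 / 2 = 6 iterations. The largest power-of-two
+; VF not exceeding 6 is 4, hence <4 x i64> is safe and <8 x i64> is not.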
+
+; RIGHTVF-LABEL: @pr34283
+; RIGHTVF: <4 x i64>
+
+; WRONGVF-LABEL: @pr34283
+; WRONGVF-NOT: <8 x i64>
+
+@a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @pr34283() local_unnamed_addr {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl i64 %indvars.iv, 1
+ %1 = sub nuw nsw i64 64, %0
+ %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %1
+ store i32 69, i32* %arrayidx, align 8
+ %2 = sub nuw nsw i64 52, %0
+ %arrayidx4 = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %2
+ store i32 7, i32* %arrayidx4, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 26
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/metadata-unroll.ll b/llvm/test/Transforms/LoopVectorize/metadata-unroll.ll
new file mode 100644
index 00000000000..c2b6b5cab83
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/metadata-unroll.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+; for (i=0; i<n; i++){
+;   a[i] += i;
+; }
+; CHECK-LABEL: @inc(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: add nsw <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: store <4 x i32>
+; CHECK: ret void
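+;
+; The doubled loads, adds, and stores checked above come from the
+; interleave-count metadata (!1 below): the vector loop is unrolled by 2 on
+; top of VF = 4. In C, the equivalent request would be a loop-hint pragma
+; (illustrative, assuming Clang's loop pragmas):
+;
+;   #pragma clang loop interleave_count(2)
+;   for (int i = 0; i < n; i++)
+;     a[i] += i;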
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = trunc i64 %indvars.iv to i32
+ %5 = add nsw i32 %3, %4
+ store i32 %5, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph, !llvm.loop !0
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
diff --git a/llvm/test/Transforms/LoopVectorize/metadata-width.ll b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
new file mode 100644
index 00000000000..455a4e1953f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/metadata-width.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @test1(
+; CHECK: store <8 x i32>
+; CHECK: ret void
+define void @test1(i32* nocapture %a, i32 %n) #0 {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = trunc i64 %indvars.iv to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 8}
diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll
new file mode 100644
index 00000000000..8a20b49d25d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/metadata.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind uwtable
+define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !tbaa !0
+ %conv = fptosi float %0 to i32
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx2, align 4, !tbaa !4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1600
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+; CHECK-LABEL: @test1
+; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa ![[TFLT:[0-9]+]]
+; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa ![[TINT:[0-9]+]]
+; CHECK: ret i32 0
+
+; CHECK-DAG: ![[TFLT]] = !{![[TFLT1:[0-9]+]]
+; CHECK-DAG: ![[TFLT1]] = !{!"float"
+
+; CHECK-DAG: ![[TINT]] = !{![[TINT1:[0-9]+]]
+; CHECK-DAG: ![[TINT1]] = !{!"int"
+
+attributes #0 = { nounwind uwtable }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !2, i64 0}
+
diff --git a/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll
new file mode 100644
index 00000000000..662d8ed7499
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/middle-block-dbg.ll
@@ -0,0 +1,110 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Confirm that the DebugLoc info for the instructions in the middle block of a
+; vectorized loop is correct. The Cmp and Br instructions should map to the
+; same source lines as the Cmp and Br of the scalar loop.
+
+; CHECK-LABEL: middle.block:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq {{.*}}!dbg ![[DL:[0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]]{{.*}} !dbg ![[DL]]
+; CHECK: ![[DL]] = !DILocation(line: 6,
+
+; This IR can be generated by running:
+; clang -g -O2 -emit-llvm -S -mllvm -opt-bisect-limit=68 vec.cpp -o - | opt -loop-vectorize -force-vector-width=2 -S -o vec.ll
+;
+; Where vec.cpp contains:
+;
+; extern int x;
+; extern int y;
+; void a() {
+; const int len = x;
+; int b[len];
+; for(int i = 0; i< len; ++i)
+; b[i] = x;
+;
+; y = b[x] + b[x-5];
+; }
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+
+@"?x@@3HA" = external dso_local local_unnamed_addr global i32, align 4
+@"?y@@3HA" = external dso_local local_unnamed_addr global i32, align 4
+
+define dso_local void @"?a@@YAXXZ"() local_unnamed_addr #0 !dbg !8 {
+entry:
+ %0 = load i32, i32* @"?x@@3HA", align 4, !dbg !23, !tbaa !24
+ %1 = zext i32 %0 to i64, !dbg !28
+ %vla = alloca i32, i64 %1, align 16, !dbg !28
+ %cmp10 = icmp sgt i32 %0, 0, !dbg !30
+ br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup, !dbg !30
+
+for.body.preheader:
+ br label %for.body, !dbg !31
+
+for.cond.cleanup.loopexit:
+ %idxprom1.phi.trans.insert = sext i32 %0 to i64
+ %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %vla, i64 %idxprom1.phi.trans.insert
+ %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4, !dbg !33, !tbaa !24
+ br label %for.cond.cleanup, !dbg !33
+
+for.cond.cleanup:
+ %2 = phi i32 [ %.pre, %for.cond.cleanup.loopexit ], [ undef, %entry ], !dbg !33
+ %sub = add nsw i32 %0, -5, !dbg !33
+ %idxprom3 = sext i32 %sub to i64, !dbg !33
+ %arrayidx4 = getelementptr inbounds i32, i32* %vla, i64 %idxprom3, !dbg !33
+ %3 = load i32, i32* %arrayidx4, align 4, !dbg !33, !tbaa !24
+ %add = add nsw i32 %3, %2, !dbg !33
+ store i32 %add, i32* @"?y@@3HA", align 4, !dbg !33, !tbaa !24
+ ret void, !dbg !34
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %vla, i64 %indvars.iv, !dbg !31
+ store i32 %0, i32* %arrayidx, align 4, !dbg !31, !tbaa !24
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !35
+ %exitcond = icmp eq i64 %indvars.iv.next, %1, !dbg !30
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !30, !llvm.loop !36
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 9.0.0 (https://github.com/llvm/llvm-project.git 045b8544fd2c4e14f7e72e0df2bc681d823b0838)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None)
+!1 = !DIFile(filename: "vec.cpp", directory: "C:\5CUsers\5Cgbhyamso\5Cdev\5Cllvm\5Csamples", checksumkind: CSK_MD5, checksum: "fed997f50117f5514a69caf1c2fb2c49")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 9.0.0 (https://github.com/llvm/llvm-project.git 045b8544fd2c4e14f7e72e0df2bc681d823b0838)"}
+!8 = distinct !DISubprogram(name: "a", linkageName: "?a@@YAXXZ", scope: !1, file: !1, line: 3, type: !9, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null}
+!11 = !{!12, !15, !17, !21}
+!12 = !DILocalVariable(name: "len", scope: !8, file: !1, line: 4, type: !13)
+!13 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !14)
+!14 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!15 = !DILocalVariable(name: "__vla_expr0", scope: !8, type: !16, flags: DIFlagArtificial)
+!16 = !DIBasicType(name: "long long unsigned int", size: 64, encoding: DW_ATE_unsigned)
+!17 = !DILocalVariable(name: "b", scope: !8, file: !1, line: 5, type: !18)
+!18 = !DICompositeType(tag: DW_TAG_array_type, baseType: !14, elements: !19)
+!19 = !{!20}
+!20 = !DISubrange(count: !15)
+!21 = !DILocalVariable(name: "i", scope: !22, file: !1, line: 6, type: !14)
+!22 = distinct !DILexicalBlock(scope: !8, file: !1, line: 6)
+!23 = !DILocation(line: 4, scope: !8)
+!24 = !{!25, !25, i64 0}
+!25 = !{!"int", !26, i64 0}
+!26 = !{!"omnipotent char", !27, i64 0}
+!27 = !{!"Simple C++ TBAA"}
+!28 = !DILocation(line: 5, scope: !8)
+!29 = !DILocation(line: 0, scope: !8)
+!30 = !DILocation(line: 6, scope: !22)
+!31 = !DILocation(line: 7, scope: !32)
+!32 = distinct !DILexicalBlock(scope: !22, file: !1, line: 6)
+!33 = !DILocation(line: 9, scope: !8)
+!34 = !DILocation(line: 10, scope: !8)
+!35 = !DILocation(line: 6, scope: !32)
+!36 = distinct !{!36, !30, !37}
+!37 = !DILocation(line: 7, scope: !22)
diff --git a/llvm/test/Transforms/LoopVectorize/miniters.ll b/llvm/test/Transforms/LoopVectorize/miniters.ll
new file mode 100644
index 00000000000..f5f4eb5eaa0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/miniters.ll
@@ -0,0 +1,44 @@
+; RUN: opt %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; RUN: opt %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s -check-prefix=UNROLL
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@b = common global [1000 x i32] zeroinitializer, align 16
+@c = common global [1000 x i32] zeroinitializer, align 16
+@a = common global [1000 x i32] zeroinitializer, align 16
+
+; Generate min.iters.check to skip the vector loop and jump directly to
+; scalar.ph when the loop iteration count is less than VF * UF.
+; CHECK-LABEL: foo(
+; CHECK: %min.iters.check = icmp ult i64 %N, 4
+; CHECK: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+; UNROLL-LABEL: foo(
+; UNROLL: %min.iters.check = icmp ult i64 %N, 8
+; UNROLL: br i1 %min.iters.check, label %scalar.ph, label %vector.ph
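+;
+; Worked out: with VF = 4 and UF = 1 the vector loop needs at least
+; 4 * 1 = 4 iterations, so the guard compares %N against 4; with UF = 2 it
+; needs 4 * 2 = 8, matching the two RUN lines above.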
+
+define void @foo(i64 %N) {
+entry:
+ %cmp.8 = icmp sgt i64 %N, 0
+ br i1 %cmp.8, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %i.09 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @b, i64 0, i64 %i.09
+ %tmp = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds [1000 x i32], [1000 x i32]* @c, i64 0, i64 %i.09
+ %tmp1 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %tmp1, %tmp
+ %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %i.09
+ store i32 %add, i32* %arrayidx2, align 4
+ %inc = add nuw nsw i64 %i.09, 1
+ %exitcond = icmp eq i64 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
new file mode 100644
index 00000000000..188700d66d8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -0,0 +1,885 @@
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@A = common global [1024 x i32] zeroinitializer, align 16
+@fA = common global [1024 x float] zeroinitializer, align 16
+@dA = common global [1024 x double] zeroinitializer, align 16
+
+; Signed tests.
+
+; Turn this into a max reduction. Make sure we use a splat to initialize the
+; vector for the reduction.
+; CHECK-LABEL: @max_red(
+; CHECK: %[[VAR:.*]] = insertelement <2 x i32> undef, i32 %max, i32 0
+; CHECK: {{.*}} = shufflevector <2 x i32> %[[VAR]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
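+;
+; As a C sketch of the scalar pattern being recognized (illustrative only):
+;
+;   int max_red(int max) {
+;     for (int i = 0; i < 1024; ++i)
+;       if (A[i] > max) max = A[i];
+;     return max;
+;   }
+;
+; The splat of %max seeds every vector lane with the same starting value, so
+; the lane-wise maxima combined in middle.block equal the scalar result.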
+
+define i32 @max_red(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp sgt i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; Turn this into a max reduction. The select has its inputs reversed, so this
+; is still a max reduction.
+; CHECK-LABEL: @max_red_inverse_select(
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @max_red_inverse_select(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp slt i32 %max.red.08, %0
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; Turn this into a min reduction.
+; CHECK-LABEL: @min_red(
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @min_red(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp slt i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; Turn this into a min reduction. The select has its inputs reversed, so this
+; is still a min reduction.
+; CHECK-LABEL: @min_red_inverse_select(
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @min_red_inverse_select(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp sgt i32 %max.red.08, %0
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; Unsigned tests.
+
+; Turn this into a max reduction.
+; CHECK-LABEL: @umax_red(
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umax_red(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp ugt i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; Turn this into a max reduction. The select has its inputs reversed, so this
+; is still a max reduction.
+; CHECK-LABEL: @umax_red_inverse_select(
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umax_red_inverse_select(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp ult i32 %max.red.08, %0
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; Turn this into a min reduction.
+; CHECK-LABEL: @umin_red(
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umin_red(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp ult i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; Turn this into a min reduction. The select has its inputs reversed, so this
+; is still a min reduction.
+; CHECK-LABEL: @umin_red_inverse_select(
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umin_red_inverse_select(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp ugt i32 %max.red.08, %0
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; SGE -> SLT
+; Turn this into a min reduction (select inputs are reversed).
+; CHECK-LABEL: @sge_min_red(
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @sge_min_red(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp sge i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; SLE -> SGT
+; Turn this into a max reduction (select inputs are reversed).
+; CHECK-LABEL: @sle_min_red(
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @sle_min_red(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp sle i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; UGE -> ULT
+; Turn this into a min reduction (select inputs are reversed).
+; CHECK-LABEL: @uge_min_red(
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @uge_min_red(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp uge i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; ULE -> UGT
+; Turn this into a max reduction (select inputs are reversed).
+; CHECK-LABEL: @ule_min_red(
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @ule_min_red(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %cmp3 = icmp ule i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; No reduction: in the two tests below either the compare or the select uses a
+; second loaded value instead of the recurrence phi, so there is no min/max
+; pattern to match.
+; CHECK-LABEL: @no_red_1(
+; CHECK-NOT: icmp <2 x i32>
+define i32 @no_red_1(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 1, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %1 = load i32, i32* %arrayidx1, align 4
+ %cmp3 = icmp sgt i32 %0, %1
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; CHECK-LABEL: @no_red_2(
+; CHECK-NOT: icmp <2 x i32>
+define i32 @no_red_2(i32 %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv
+ %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 1, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %1 = load i32, i32* %arrayidx1, align 4
+ %cmp3 = icmp sgt i32 %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, i32 %0, i32 %1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %max.red.0
+}
+
+; Float tests.
+
+; Maximum.
+
+; Turn this into a max reduction in the presence of a no-nans-fp-math attribute.
+; CHECK-LABEL: @max_red_float(
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @max_red_float(float %max) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ogt float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+; CHECK-LABEL: @max_red_float_ge(
+; CHECK: fcmp fast oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @max_red_float_ge(float %max) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast oge float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+; CHECK-LABEL: @inverted_max_red_float(
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_max_red_float(float %max) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast olt float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+; CHECK-LABEL: @inverted_max_red_float_le(
+; CHECK: fcmp fast ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_max_red_float_le(float %max) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ole float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+; CHECK-LABEL: @unordered_max_red_float(
+; CHECK: fcmp fast ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_max_red_float(float %max) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ugt float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+; CHECK-LABEL: @unordered_max_red_float_ge(
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_max_red_float_ge(float %max) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast uge float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+; CHECK-LABEL: @inverted_unordered_max_red_float(
+; CHECK: fcmp fast oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_max_red_float(float %max) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ult float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+; CHECK-LABEL: @inverted_unordered_max_red_float_le(
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_max_red_float_le(float %max) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ule float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+; Minimum.
+
+; Turn this into a min reduction in the presence of a no-nans-fp-math attribute.
+; CHECK-LABEL: @min_red_float(
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @min_red_float(float %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast olt float %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %min.red.0
+}
+
+; CHECK-LABEL: @min_red_float_le(
+; CHECK: fcmp fast ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @min_red_float_le(float %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ole float %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %min.red.0
+}
+
+; CHECK-LABEL: @inverted_min_red_float(
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_min_red_float(float %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ogt float %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %min.red.0
+}
+
+; CHECK-LABEL: @inverted_min_red_float_ge(
+; CHECK: fcmp fast oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_min_red_float_ge(float %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast oge float %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %min.red.0
+}
+
+; CHECK-LABEL: @unordered_min_red_float(
+; CHECK: fcmp fast oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_min_red_float(float %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ult float %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %min.red.0
+}
+
+; CHECK-LABEL: @unordered_min_red_float_le(
+; CHECK: fcmp fast ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_min_red_float_le(float %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ule float %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %min.red.0
+}
+
+; CHECK-LABEL: @inverted_unordered_min_red_float(
+; CHECK: fcmp fast ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_min_red_float(float %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ugt float %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %min.red.0
+}
+
+; CHECK-LABEL: @inverted_unordered_min_red_float_ge(
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_min_red_float_ge(float %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast uge float %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %min.red.0
+}
+
+; Make sure we handle doubles, too.
+; CHECK-LABEL: @min_red_double(
+; CHECK: fcmp fast olt <2 x double>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp fast olt <2 x double>
+; CHECK: select <2 x i1>
+
+define double @min_red_double(double %min) #0 {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %min.red.08 = phi double [ %min, %entry ], [ %min.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x double], [1024 x double]* @dA, i64 0, i64 %indvars.iv
+ %0 = load double, double* %arrayidx, align 4
+ %cmp3 = fcmp fast olt double %0, %min.red.08
+ %min.red.0 = select i1 %cmp3, double %0, double %min.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret double %min.red.0
+}
+
+
+; Don't turn this into a max reduction; the no-nans-fp-math attribute is missing.
+; CHECK-LABEL: @max_red_float_nans(
+; CHECK-NOT: <2 x float>
+
+define float @max_red_float_nans(float %max) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+ %arrayidx = getelementptr inbounds [1024 x float], [1024 x float]* @fA, i64 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %cmp3 = fcmp fast ogt float %0, %max.red.08
+ %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret float %max.red.0
+}
+
+
+attributes #0 = { "no-nans-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
new file mode 100644
index 00000000000..4fdb0178410
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/multi-use-reduction-bug.ll
@@ -0,0 +1,41 @@
+; RUN: opt -indvars -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; We must not vectorize this loop: %add55 is not a reduction, because its
+; value is used multiple times inside the loop.
+
+; PR18526
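+
+; Roughly, as C (a hand-written sketch, not the original PR18526 source):
+;   int sum = n;
+;   for (int i = ...; i < 61; i++)
+;     sum = (sum - k - nf) + sum;  // the old 'sum' feeds its own update twice
+; Because the accumulator appears as two operands of its own update, the
+; phi does not form a single reduction chain.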
+
+; CHECK: multiple_use_of_value
+; CHECK-NOT: <2 x i32>
+
+define void @multiple_use_of_value() {
+entry:
+ %n = alloca i32, align 4
+ %k7 = alloca i32, align 4
+ %nf = alloca i32, align 4
+ %0 = load i32, i32* %k7, align 4
+ %.neg1 = sub i32 0, %0
+ %n.promoted = load i32, i32* %n, align 4
+ %nf.promoted = load i32, i32* %nf, align 4
+ br label %for.body
+
+for.body:
+ %inc107 = phi i32 [ undef, %entry ], [ %inc10, %for.body ]
+ %inc6 = phi i32 [ %nf.promoted, %entry ], [ undef, %for.body ]
+ %add55 = phi i32 [ %n.promoted, %entry ], [ %add5, %for.body ]
+ %.neg2 = sub i32 0, %inc6
+ %add.neg = add i32 0, %add55
+ %add4.neg = add i32 %add.neg, %.neg1
+ %sub = add i32 %add4.neg, %.neg2
+ %add5 = add i32 %sub, %add55
+ %inc10 = add i32 %inc107, 1
+ %cmp = icmp ult i32 %inc10, 61
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ %add5.lcssa = phi i32 [ %add5, %for.body ]
+ store i32 %add5.lcssa, i32* %n, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
new file mode 100644
index 00000000000..cb6478f1030
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/multiple-address-spaces.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; From a simple program with two address spaces:
+; char Y[4*10000] __attribute__((address_space(1)));
+; char X[4*10000];
+; int main() {
+; for (int i = 0; i < 4*10000; ++i)
+; X[i] = Y[i] + 1;
+; return 0;
+;}
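+; The interesting property here is that the widened loads and stores keep
+; their original address spaces, as the two bitcast CHECK lines below
+; verify.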
+
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@Y = common addrspace(1) global [40000 x i8] zeroinitializer, align 16
+@X = common global [40000 x i8] zeroinitializer, align 16
+
+;CHECK-LABEL: @main(
+;CHECK: bitcast i8 addrspace(1)* %{{.*}} to <4 x i8> addrspace(1)*
+;CHECK: bitcast i8* %{{.*}} to <4 x i8>*
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [40000 x i8], [40000 x i8] addrspace(1)* @Y, i64 0, i64 %indvars.iv
+ %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+ %add = add i8 %0, 1
+ %arrayidx3 = getelementptr inbounds [40000 x i8], [40000 x i8]* @X, i64 0, i64 %indvars.iv
+ store i8 %add, i8* %arrayidx3, align 1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 40000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
new file mode 100644
index 00000000000..33801cab2fe
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
@@ -0,0 +1,64 @@
+; RUN: opt -loop-vectorize -force-vector-width=4 -S < %s | FileCheck %s
+
+; This is the test case from PR26314.
+; When we were retrying dependence checking with memchecks only,
+; the loop-invariant access in the inner loop was incorrectly determined to be wrapping
+; because it was not strided in the inner loop.
+; Improved wrapping detection allows vectorization in the following case.
+
+; #define Z 32
+; typedef struct s {
+; int v1[Z];
+; int v2[Z];
+; int v3[Z][Z];
+; } s;
+;
+; void slow_function (s* const obj, int z) {
+; for (int j=0; j<Z; j++) {
+; for (int k=0; k<z; k++) {
+; int x = obj->v1[k] + obj->v2[j];
+; obj->v3[j][k] += x;
+; }
+; }
+; }
+
+; CHECK-LABEL: Test
+; CHECK: <4 x i64>
+; CHECK: <4 x i32>, <4 x i32>
+; CHECK: !{!"llvm.loop.isvectorized", i32 1}
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+%struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] }
+
+define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
+ br label %.outer.preheader
+
+
+.outer.preheader:
+ %i = phi i64 [ 0, %0 ], [ %i.next, %.outer ]
+ %1 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 1, i64 %i
+ br label %.inner
+
+.exit:
+ ret void
+
+.outer:
+ %i.next = add nuw nsw i64 %i, 1
+ %exitcond.outer = icmp eq i64 %i.next, 32
+ br i1 %exitcond.outer, label %.exit, label %.outer.preheader
+
+.inner:
+ %j = phi i64 [ 0, %.outer.preheader ], [ %j.next, %.inner ]
+ %2 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 0, i64 %j
+ %3 = load i32, i32* %2
+ %4 = load i32, i32* %1
+ %5 = add nsw i32 %4, %3
+ %6 = getelementptr inbounds %struct.s, %struct.s* %obj, i64 0, i32 2, i64 %i, i64 %j
+ %7 = load i32, i32* %6
+ %8 = add nsw i32 %5, %7
+ store i32 %8, i32* %6
+ %j.next = add nuw nsw i64 %j, 1
+ %exitcond.inner = icmp eq i64 %j.next, %z
+ br i1 %exitcond.inner, label %.outer, label %.inner
+}
diff --git a/llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll b/llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll
new file mode 100644
index 00000000000..a0c53450482
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no-interleave-up-front.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test case based on reproducer for
+; http://bugs.chromium.org/p/oss-fuzz/issues/detail?id=6477
+
+define void @test1(i32 %n) #0 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: br i1 false, label [[DOTLR_PH_PREHEADER:%.*]], label [[DOT_CRIT_EDGE:%.*]]
+; CHECK: .lr.ph.preheader:
+; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]], !llvm.loop !0
+; CHECK: ._crit_edge.loopexit:
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: ret void
+;
+ br i1 false, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %.lr.ph, %0
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ br i1 true, label %._crit_edge, label %.lr.ph, !llvm.loop !0
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.interleave.count", i32 2}
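+
+; !1 requests an interleave count of 2. The CHECK lines above verify that
+; the request is not honored up front for this degenerate loop, which is
+; left in scalar form.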
diff --git a/llvm/test/Transforms/LoopVectorize/no_array_bounds.ll b/llvm/test/Transforms/LoopVectorize/no_array_bounds.ll
new file mode 100644
index 00000000000..c6a2431eba5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no_array_bounds.ll
@@ -0,0 +1,100 @@
+; RUN: opt < %s -loop-vectorize -transform-warning -S 2>&1 | FileCheck %s
+
+; Verify that a warning is generated when vectorization/interleaving is explicitly requested but fails to occur.
+; CHECK: warning: no_array_bounds.cpp:5:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+; CHECK: warning: no_array_bounds.cpp:10:5: loop not interleaved: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; #pragma clang loop vectorize(enable)
+; for (int i = 0; i < number; i++) {
+; A[B[i]]++;
+; }
+
+; #pragma clang loop vectorize(disable) interleave(enable)
+; for (int i = 0; i < number; i++) {
+; B[A[i]]++;
+; }
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind ssp uwtable
+define void @_Z4testPiS_i(i32* nocapture %A, i32* nocapture %B, i32 %number) #0 !dbg !4 {
+entry:
+ %cmp25 = icmp sgt i32 %number, 0, !dbg !10
+ br i1 %cmp25, label %for.body.preheader, label %for.end15, !dbg !10, !llvm.loop !12
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !14
+
+for.cond5.preheader: ; preds = %for.body
+ br i1 %cmp25, label %for.body7.preheader, label %for.end15, !dbg !16, !llvm.loop !18
+
+for.body7.preheader: ; preds = %for.cond5.preheader
+ br label %for.body7, !dbg !20
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv27 = phi i64 [ %indvars.iv.next28, %for.body ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv27, !dbg !14
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !14, !tbaa !22
+ %idxprom1 = sext i32 %0 to i64, !dbg !14
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !14
+ %1 = load i32, i32* %arrayidx2, align 4, !dbg !14, !tbaa !22
+ %inc = add nsw i32 %1, 1, !dbg !14
+ store i32 %inc, i32* %arrayidx2, align 4, !dbg !14, !tbaa !22
+ %indvars.iv.next28 = add nuw nsw i64 %indvars.iv27, 1, !dbg !10
+ %lftr.wideiv29 = trunc i64 %indvars.iv.next28 to i32, !dbg !10
+ %exitcond30 = icmp eq i32 %lftr.wideiv29, %number, !dbg !10
+ br i1 %exitcond30, label %for.cond5.preheader, label %for.body, !dbg !10, !llvm.loop !12
+
+for.body7: ; preds = %for.body7.preheader, %for.body7
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body7 ], [ 0, %for.body7.preheader ]
+ %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !20
+ %2 = load i32, i32* %arrayidx9, align 4, !dbg !20, !tbaa !22
+ %idxprom10 = sext i32 %2 to i64, !dbg !20
+ %arrayidx11 = getelementptr inbounds i32, i32* %B, i64 %idxprom10, !dbg !20
+ %3 = load i32, i32* %arrayidx11, align 4, !dbg !20, !tbaa !22
+ %inc12 = add nsw i32 %3, 1, !dbg !20
+ store i32 %inc12, i32* %arrayidx11, align 4, !dbg !20, !tbaa !22
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !16
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !16
+ %exitcond = icmp eq i32 %lftr.wideiv, %number, !dbg !16
+ br i1 %exitcond, label %for.end15.loopexit, label %for.body7, !dbg !16, !llvm.loop !18
+
+for.end15.loopexit: ; preds = %for.body7
+ br label %for.end15
+
+for.end15: ; preds = %for.end15.loopexit, %entry, %for.cond5.preheader
+ ret void, !dbg !26
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "no_array_bounds.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "no_array_bounds.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0"}
+!10 = !DILocation(line: 4, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 4, column: 3, file: !1, scope: !4)
+!12 = !{!12, !13}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+!14 = !DILocation(line: 5, column: 5, scope: !15)
+!15 = distinct !DILexicalBlock(line: 4, column: 36, file: !1, scope: !11)
+!16 = !DILocation(line: 9, column: 8, scope: !17)
+!17 = distinct !DILexicalBlock(line: 9, column: 3, file: !1, scope: !4)
+!18 = !{!18, !13, !19}
+!19 = !{!"llvm.loop.vectorize.width", i32 1}
+!20 = !DILocation(line: 10, column: 5, scope: !21)
+!21 = distinct !DILexicalBlock(line: 9, column: 36, file: !1, scope: !17)
+!22 = !{!23, !23, i64 0}
+!23 = !{!"int", !24, i64 0}
+!24 = !{!"omnipotent char", !25, i64 0}
+!25 = !{!"Simple C/C++ TBAA"}
+!26 = !DILocation(line: 12, column: 1, scope: !4)
diff --git a/llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll b/llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll
new file mode 100644
index 00000000000..bfa48a2529b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no_idiv_reduction.ll
@@ -0,0 +1,24 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
+@a = common global [128 x i32] zeroinitializer, align 16
+
+;; Must not vectorize a division reduction: integer division truncates, so the chain cannot be reassociated.
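+;; For example, (80 / 5) / 2 == 8, but 80 / (5 / 2) == 40.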
+define i32 @g() {
+entry:
+ br label %for.body
+
+for.body:
+ ; CHECK-LABEL: @g(
+ ; CHECK-NOT: sdiv <2 x i32>
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %r.05 = phi i32 [ 80, %entry ], [ %div, %for.body ]
+ %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @a, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %div = sdiv i32 %r.05, %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %div
+}
diff --git a/llvm/test/Transforms/LoopVectorize/no_int_induction.ll b/llvm/test/Transforms/LoopVectorize/no_int_induction.ll
new file mode 100644
index 00000000000..7e6b26cd900
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no_int_induction.ll
@@ -0,0 +1,60 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; int __attribute__((noinline)) sum_array(int *A, int n) {
+; return std::accumulate(A, A + n, 0);
+; }
+
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
+
+;CHECK-LABEL: @sum_array(
+;CHECK: phi i64
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: ret i32
+define i32 @sum_array(i32* %A, i32 %n) nounwind uwtable readonly noinline ssp {
+ %1 = sext i32 %n to i64
+ %2 = getelementptr inbounds i32, i32* %A, i64 %1
+ %3 = icmp eq i32 %n, 0
+ br i1 %3, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+.lr.ph.i: ; preds = %0, %.lr.ph.i
+ %.03.i = phi i32* [ %6, %.lr.ph.i ], [ %A, %0 ]
+ %.012.i = phi i32 [ %5, %.lr.ph.i ], [ 0, %0 ]
+ %4 = load i32, i32* %.03.i, align 4
+ %5 = add nsw i32 %4, %.012.i
+ %6 = getelementptr inbounds i32, i32* %.03.i, i64 1
+ %7 = icmp eq i32* %6, %2
+ br i1 %7, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit: ; preds = %.lr.ph.i, %0
+ %.01.lcssa.i = phi i32 [ 0, %0 ], [ %5, %.lr.ph.i ]
+ ret i32 %.01.lcssa.i
+}
+
+; Same, but use a pointer in address space 1, which has a different (16-bit) size.
+;CHECK-LABEL: @sum_array_as1(
+;CHECK: phi i16
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: ret i32
+define i32 @sum_array_as1(i32 addrspace(1)* %A, i32 %n) nounwind uwtable readonly noinline ssp {
+ %1 = sext i32 %n to i64
+ %2 = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %1
+ %3 = icmp eq i32 %n, 0
+ br i1 %3, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+.lr.ph.i: ; preds = %0, %.lr.ph.i
+ %.03.i = phi i32 addrspace(1)* [ %6, %.lr.ph.i ], [ %A, %0 ]
+ %.012.i = phi i32 [ %5, %.lr.ph.i ], [ 0, %0 ]
+ %4 = load i32, i32 addrspace(1)* %.03.i, align 4
+ %5 = add nsw i32 %4, %.012.i
+ %6 = getelementptr inbounds i32, i32 addrspace(1)* %.03.i, i64 1
+ %7 = icmp eq i32 addrspace(1)* %6, %2
+ br i1 %7, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %.lr.ph.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit: ; preds = %.lr.ph.i, %0
+ %.01.lcssa.i = phi i32 [ 0, %0 ], [ %5, %.lr.ph.i ]
+ ret i32 %.01.lcssa.i
+}
diff --git a/llvm/test/Transforms/LoopVectorize/no_outside_user.ll b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll
new file mode 100644
index 00000000000..f2dd557e80a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no_outside_user.ll
@@ -0,0 +1,414 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+
+@f = common global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+@e = common global i32 0, align 4
+
+; This loop has a value, %tmp17, that is used outside of the loop and is
+; not a recognized reduction variable. However, %tmp17 is a non-header
+; phi, which is an allowed exit value.
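+;
+; Roughly, as C (a hand-written sketch, not the original source):
+;   for (i = b; i < 4; i++)
+;     last = (i > 10) ? 1 : 0;
+;   return last;
+; The vectorized loop recovers the exit value by extracting the last lane
+; in the middle block.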
+
+; CHECK-LABEL: @test1(
+; CHECK: %vec.ind = phi <2 x i32>
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
+; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
+
+define i32 @test1() {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+ %tmp2 = icmp sgt i32 %tmp8, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+ %tmp18 = add nsw i32 %tmp8, 1
+ %tmp19 = icmp slt i32 %tmp18, 4
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i32 [ %tmp17, %bb16 ]
+ ret i32 %.lcssa
+}
+
+; The non-header phi depends on the header phi.
+; CHECK-LABEL: @test2(
+; CHECK: %vec.ind = phi <2 x i32>
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
+; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> %vec.ind
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
+define i32 @test2() {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+ %tmp2 = icmp sgt i32 %tmp8, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ %tmp8, %bb10 ], [ 1, %.lr.ph.i ]
+ %tmp18 = add nsw i32 %tmp8, 1
+ %tmp19 = icmp slt i32 %tmp18, 4
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i32 [ %tmp17, %bb16 ]
+ ret i32 %.lcssa
+}
+
+; More than two incoming values for the %tmp17 phi that is used outside the loop.
+; CHECK-LABEL: test3(
+; CHECK-LABEL: vector.body:
+; CHECK: %predphi = select <2 x i1> %{{.*}}, <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+; CHECK: %predphi1 = select <2 x i1> %{{.*}}, <2 x i32> <i32 2, i32 2>, <2 x i32> %predphi
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi1, i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
+define i32 @test3(i32 %N) {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+ %tmp2 = icmp sgt i32 %tmp8, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ %cmp = icmp sgt i32 %tmp8, %N
+ br i1 %cmp, label %bb12, label %bb16
+
+bb12:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ], [ 2, %bb12 ]
+ %tmp18 = add nsw i32 %tmp8, 1
+ %tmp19 = icmp slt i32 %tmp18, 4
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i32 [ %tmp17, %bb16 ]
+ ret i32 %.lcssa
+}
+
+; More than one incoming value for the outside user %.lcssa.
+; CHECK-LABEL: test4(
+; CHECK-LABEL: vector.body:
+; CHECK: %predphi = select <2 x i1>
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> %predphi, i32 1
+
+; CHECK-LABEL: f1.exit.loopexit.loopexit:
+; CHECK: %tmp17.lcssa = phi i32 [ %tmp17, %bb16 ], [ [[E1]], %middle.block ]
+; CHECK-NEXT: br label %f1.exit.loopexit
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i32 [ 2, %bb ], [ %tmp17.lcssa, %f1.exit.loopexit.loopexit ]
+define i32 @test4(i32 %N) {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ %icmp = icmp slt i32 %b.promoted, %N
+ br i1 %icmp, label %f1.exit.loopexit, label %.lr.ph.i
+
+.lr.ph.i:
+ %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+ %tmp2 = icmp sgt i32 %tmp8, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+ %tmp18 = add nsw i32 %tmp8, 1
+ %tmp19 = icmp slt i32 %tmp18, 4
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i32 [ %tmp17, %bb16 ], [ 2, %bb ]
+ ret i32 %.lcssa
+}
+
+; A non-header phi that depends on the reduction and is used outside the
+; loop. Reduction phis are only allowed to have the bump or the reduction
+; operation as their in-loop user, so we should not vectorize this.
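+; Concretely, %tmp17 takes the reduction phi %sum.02 as an incoming value
+; on the %bb10 edge, and that use is not part of the reduction chain.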
+; CHECK-LABEL: reduction_sum(
+; CHECK-NOT: <2 x i32>
+define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+entry:
+ %c1 = icmp sgt i32 %n, 0
+ br i1 %c1, label %header, label %._crit_edge
+
+header:                                           ; preds = %entry, %bb16
+ %indvars.iv = phi i64 [ %indvars.iv.next, %bb16 ], [ 0, %entry ]
+ %sum.02 = phi i32 [ %c9, %bb16 ], [ 0, %entry ]
+ %c2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %c3 = load i32, i32* %c2, align 4
+ %c4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %c5 = load i32, i32* %c4, align 4
+ %tmp2 = icmp sgt i32 %sum.02, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ %sum.02, %bb10 ], [ 1, %header ]
+ %c6 = trunc i64 %indvars.iv to i32
+ %c7 = add i32 %sum.02, %c6
+ %c8 = add i32 %c7, %c3
+ %c9 = add i32 %c8, %c5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %header
+
+._crit_edge:                                      ; preds = %entry, %bb16
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %c9, %bb16 ]
+ %nonhdr.lcssa = phi i32 [ 1, %entry], [ %tmp17, %bb16 ]
+ ret i32 %sum.0.lcssa
+}
+
+; An invalid cyclic dependency on the header phi %iv prevents it from
+; being recognized as an induction variable; we cannot vectorize.
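+; Concretely, %ivnext is computed from the non-header phi %tmp17 rather
+; than from %iv itself, so %iv does not form a simple i = i + 1 recurrence.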
+; CHECK-LABEL: cyclic_dep_with_indvar(
+; CHECK-NOT: <2 x i32>
+define i32 @cyclic_dep_with_indvar() {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %iv = phi i32 [ %ivnext, %bb16 ], [ %b.promoted, %bb ]
+ %tmp2 = icmp sgt i32 %iv, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ 0, %bb10 ], [ %iv, %.lr.ph.i ]
+ %ivnext = add nsw i32 %tmp17, 1
+ %tmp19 = icmp slt i32 %ivnext, 4
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i32 [ %tmp17, %bb16 ]
+ ret i32 %.lcssa
+}
+
+; The non-reduction phi %tmp17, used outside the loop, has a cyclic
+; dependence with the %x.05 phi; we cannot vectorize.
+; CHECK-LABEL: not_valid_reduction(
+; CHECK-NOT: <2 x i32>
+define i32 @not_valid_reduction(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ]
+ %x.05 = phi i32 [ %tmp17, %latch ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %tmp0 = load i32, i32* %arrayidx, align 4
+ %tmp2 = icmp sgt i64 %indvars.iv, 10
+ %sub = sub nsw i32 %x.05, %tmp0
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ 1, %bb10 ], [ %sub, %for.body ]
+ br label %latch
+
+latch:
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %x.0.lcssa = phi i32 [ 0, %entry ], [ %tmp17 , %latch ]
+ ret i32 %x.0.lcssa
+}
+
+
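+; Here the value escaping the loop is produced by a non-phi instruction:
+; the non-header phi is truncated to i8 inside the loop, and the middle
+; block extracts the last lane of the truncated vector.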
+; CHECK-LABEL: @outside_user_non_phi(
+; CHECK: %vec.ind = phi <2 x i32>
+; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i32> %vec.ind, <i32 10, i32 10>
+; CHECK: %predphi = select <2 x i1> [[CMP]], <2 x i32> <i32 1, i32 1>, <2 x i32> zeroinitializer
+; CHECK: [[TRUNC:%[a-zA-Z0-9.]+]] = trunc <2 x i32> %predphi to <2 x i8>
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i8> [[TRUNC]], i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i8 [ %tmp17.trunc, %bb16 ], [ [[E1]], %middle.block ]
+define i8 @outside_user_non_phi() {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+ %tmp2 = icmp sgt i32 %tmp8, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+ %tmp17.trunc = trunc i32 %tmp17 to i8
+ %tmp18 = add nsw i32 %tmp8, 1
+ %tmp19 = icmp slt i32 %tmp18, 4
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i8 [ %tmp17.trunc, %bb16 ]
+ ret i8 %.lcssa
+}
+
+; CHECK-LABEL: no_vectorize_reduction_with_outside_use(
+; CHECK-NOT: <2 x i32>
+define i32 @no_vectorize_reduction_with_outside_use(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %or = or i32 %add, %result.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa = phi i32 [ 0, %entry ], [ %1, %for.body ]
+ ret i32 %result.0.lcssa
+}
+
+
+; Vectorize a c[i] = a[i] + b[i] loop where the last value of c[i] is
+; also used outside the loop.
+; CHECK-LABEL: sum_arrays_outside_use(
+; CHECK-LABEL: vector.memcheck:
+; CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph
+
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <2 x i32>, <2 x i32>*
+; CHECK: %wide.load16 = load <2 x i32>, <2 x i32>*
+; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> %wide.load, %wide.load16
+; CHECK: store <2 x i32>
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[E1:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1
+
+; CHECK-LABEL: f1.exit.loopexit:
+; CHECK: %.lcssa = phi i32 [ %sum, %.lr.ph.i ], [ [[E1]], %middle.block ]
+define i32 @sum_arrays_outside_use(i32* %B, i32* %A, i32* %C, i32 %N) {
+bb:
+ %b.promoted = load i32, i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %iv = phi i32 [ %ivnext, %.lr.ph.i ], [ %b.promoted, %bb ]
+ %indvars.iv = sext i32 %iv to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %Bload = load i32, i32* %arrayidx2, align 4
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %Aload = load i32, i32* %arrayidx, align 4
+ %sum = add nsw i32 %Bload, %Aload
+ %arrayidx3 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ store i32 %sum, i32* %arrayidx3, align 4
+ %ivnext = add nsw i32 %iv, 1
+ %tmp19 = icmp slt i32 %ivnext, %N
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i32 [ %sum, %.lr.ph.i ]
+ ret i32 %.lcssa
+}
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
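+; The live-out %i.09 is derived from the induction variable (%i.08 + 7)
+; rather than being the canonical IV itself, so the middle block has to
+; extract the last lane of the vectorized add to form the exit value.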
+; CHECK-LABEL: non_uniform_live_out()
+; CHECK-LABEL: vector.body:
+; CHECK: %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK: [[ADD:%[a-zA-Z0-9.]+]] = add <2 x i32> %vec.ind, <i32 7, i32 7>
+; CHECK: [[EE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 0
+; CHECK: [[GEP:%[a-zA-Z0-9.]+]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[EE]]
+; CHECK-NEXT: [[GEP2:%[a-zA-Z0-9.]+]] = getelementptr inbounds i8, i8* [[GEP]], i32 0
+; CHECK-NEXT: [[BC:%[a-zA-Z0-9.]+]] = bitcast i8* [[GEP2]] to <2 x i8>*
+; CHECK-NEXT: %wide.load = load <2 x i8>, <2 x i8>* [[BC]]
+; CHECK-NEXT: [[ADD2:%[a-zA-Z0-9.]+]] = add <2 x i8> %wide.load, <i8 1, i8 1>
+; CHECK: store <2 x i8> [[ADD2]], <2 x i8>*
+
+; CHECK-LABEL: middle.block:
+; CHECK: [[ADDEE:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[ADD]], i32 1
+
+; CHECK-LABEL: for.end:
+; CHECK: %lcssa = phi i32 [ %i.09, %for.body ], [ [[ADDEE]], %middle.block ]
+; CHECK: %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa
+define i32 @non_uniform_live_out() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %i.09 = add i32 %i.08, 7
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.09
+ %0 = load i8, i8* %arrayidx, align 1
+ %bump = add i8 %0, 1
+ store i8 %bump, i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 20000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %lcssa = phi i32 [%i.09, %for.body]
+ %arrayidx.out = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %lcssa
+ store i8 42, i8* %arrayidx.out, align 1
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/no_switch.ll b/llvm/test/Transforms/LoopVectorize/no_switch.ll
new file mode 100644
index 00000000000..3976463e766
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no_switch.ll
@@ -0,0 +1,93 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -transform-warning -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO
+
+; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
+; CHECK: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; NOANALYSIS-NOT: remark: {{.*}}
+; NOANALYSIS: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; MOREINFO: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
+; MOREINFO: remark: source.cpp:4:5: loop not vectorized (Force=true, Vector Width=4)
+; MOREINFO: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
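+
+; The vectorizer's if-conversion only handles conditional branches, so a
+; switch in the loop body cannot be flattened into straight-line code;
+; hence the remark above.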
+
+; CHECK: _Z11test_switchPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z11test_switchPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 {
+entry:
+ %cmp18 = icmp sgt i32 %Length, 0, !dbg !10
+ br i1 %cmp18, label %for.body.preheader, label %for.end, !dbg !10, !llvm.loop !12
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !14
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !14
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !14, !tbaa !16
+ switch i32 %0, label %for.inc [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb3
+ ], !dbg !14
+
+sw.bb: ; preds = %for.body
+ %1 = trunc i64 %indvars.iv to i32, !dbg !20
+ %mul = shl nsw i32 %1, 1, !dbg !20
+ br label %for.inc, !dbg !22
+
+sw.bb3: ; preds = %for.body
+ %2 = trunc i64 %indvars.iv to i32, !dbg !23
+ store i32 %2, i32* %arrayidx, align 4, !dbg !23, !tbaa !16
+ br label %for.inc, !dbg !23
+
+for.inc: ; preds = %sw.bb3, %for.body, %sw.bb
+ %storemerge = phi i32 [ %mul, %sw.bb ], [ 0, %for.body ], [ 0, %sw.bb3 ]
+ store i32 %storemerge, i32* %arrayidx, align 4, !dbg !20, !tbaa !16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+ %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !10
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !10, !llvm.loop !12
+
+for.end.loopexit: ; preds = %for.inc
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void, !dbg !24
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test_switch", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0"}
+!10 = !DILocation(line: 3, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!12 = !{!12, !13, !13}
+!13 = !{!"llvm.loop.vectorize.enable", i1 true}
+!14 = !DILocation(line: 4, column: 5, scope: !15)
+!15 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !11)
+!16 = !{!17, !17, i64 0}
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C/C++ TBAA"}
+!20 = !DILocation(line: 6, column: 7, scope: !21)
+!21 = distinct !DILexicalBlock(line: 4, column: 18, file: !1, scope: !15)
+!22 = !DILocation(line: 7, column: 5, scope: !21)
+!23 = !DILocation(line: 9, column: 7, scope: !21)
+!24 = !DILocation(line: 14, column: 1, scope: !4)
diff --git a/llvm/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll b/llvm/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll
new file mode 100644
index 00000000000..424ef384622
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no_switch_disable_vectorization.ll
@@ -0,0 +1,95 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -transform-warning -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -transform-warning -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO
+
+; This test is a copy of no_switch.ll with the "llvm.loop.vectorize.enable"
+; metadata set to false. It checks that when vectorization is explicitly
+; disabled, no warnings are emitted.
+
+; CHECK-NOT: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement
+; CHECK-NOT: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; NOANALYSIS-NOT: remark: {{.*}}
+; NOANALYSIS-NOT: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; MOREINFO: remark: source.cpp:4:5: loop not vectorized: vectorization is explicitly disabled
+; MOREINFO-NOT: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+
+; CHECK: _Z11test_switchPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: nounwind optsize ssp uwtable
+define void @_Z11test_switchPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 {
+entry:
+ %cmp18 = icmp sgt i32 %Length, 0, !dbg !10
+ br i1 %cmp18, label %for.body.preheader, label %for.end, !dbg !10, !llvm.loop !12
+
+for.body.preheader: ; preds = %entry
+ br label %for.body, !dbg !14
+
+for.body: ; preds = %for.body.preheader, %for.inc
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !14
+ %0 = load i32, i32* %arrayidx, align 4, !dbg !14, !tbaa !16
+ switch i32 %0, label %for.inc [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb3
+ ], !dbg !14
+
+sw.bb: ; preds = %for.body
+ %1 = trunc i64 %indvars.iv to i32, !dbg !20
+ %mul = shl nsw i32 %1, 1, !dbg !20
+ br label %for.inc, !dbg !22
+
+sw.bb3: ; preds = %for.body
+ %2 = trunc i64 %indvars.iv to i32, !dbg !23
+ store i32 %2, i32* %arrayidx, align 4, !dbg !23, !tbaa !16
+ br label %for.inc, !dbg !23
+
+for.inc: ; preds = %sw.bb3, %for.body, %sw.bb
+ %storemerge = phi i32 [ %mul, %sw.bb ], [ 0, %for.body ], [ 0, %sw.bb3 ]
+ store i32 %storemerge, i32* %arrayidx, align 4, !dbg !20, !tbaa !16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !10
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !10
+ %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !10
+ br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !10, !llvm.loop !12
+
+for.end.loopexit: ; preds = %for.inc
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void, !dbg !24
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "source.cpp", directory: ".")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "test_switch", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2)
+!5 = !DIFile(filename: "source.cpp", directory: ".")
+!6 = !DISubroutineType(types: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.5.0"}
+!10 = !DILocation(line: 3, column: 8, scope: !11)
+!11 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4)
+!12 = !{!12, !13, !13}
+!13 = !{!"llvm.loop.vectorize.enable", i1 false}
+!14 = !DILocation(line: 4, column: 5, scope: !15)
+!15 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !11)
+!16 = !{!17, !17, i64 0}
+!17 = !{!"int", !18, i64 0}
+!18 = !{!"omnipotent char", !19, i64 0}
+!19 = !{!"Simple C/C++ TBAA"}
+!20 = !DILocation(line: 6, column: 7, scope: !21)
+!21 = distinct !DILexicalBlock(line: 4, column: 18, file: !1, scope: !15)
+!22 = !DILocation(line: 7, column: 5, scope: !21)
+!23 = !DILocation(line: 9, column: 7, scope: !21)
+!24 = !DILocation(line: 14, column: 1, scope: !4)
diff --git a/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll b/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll
new file mode 100644
index 00000000000..233d530dc10
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/noalias-md-licm.ll
@@ -0,0 +1,59 @@
+; RUN: opt -basicaa -scoped-noalias -loop-vectorize -licm -force-vector-width=2 \
+; RUN: -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; In order to vectorize the inner loop, it needs to be versioned with
+; memchecks between {A} x {B, C} first:
+;
+; for (i = 0; i < n; i++)
+; for (j = 0; j < m; j++)
+; A[j] += B[i] + C[j];
+;
+; Since in the versioned vector loop A and B can no longer alias, B[i] can be
+; LICM'ed from the inner loop.
+
+
+define void @f(i32* %a, i32* %b, i32* %c) {
+entry:
+ br label %outer
+
+outer:
+ %i.2 = phi i64 [ 0, %entry ], [ %i, %inner.end ]
+ %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %i.2
+ br label %inner.ph
+
+inner.ph:
+; CHECK: vector.ph:
+; CHECK: load i32, i32* %arrayidxB,
+; CHECK: br label %vector.body
+ br label %inner
+
+inner:
+ %j.2 = phi i64 [ 0, %inner.ph ], [ %j, %inner ]
+
+ %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %j.2
+ %loadA = load i32, i32* %arrayidxA, align 4
+
+ %loadB = load i32, i32* %arrayidxB, align 4
+
+ %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %j.2
+ %loadC = load i32, i32* %arrayidxC, align 4
+
+ %add = add nuw i32 %loadA, %loadB
+ %add2 = add nuw i32 %add, %loadC
+
+ store i32 %add2, i32* %arrayidxA, align 4
+
+ %j = add nuw nsw i64 %j.2, 1
+ %cond1 = icmp eq i64 %j, 20
+ br i1 %cond1, label %inner.end, label %inner
+
+inner.end:
+ %i = add nuw nsw i64 %i.2, 1
+ %cond2 = icmp eq i64 %i, 30
+ br i1 %cond2, label %outer.end, label %outer
+
+outer.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/noalias-md.ll b/llvm/test/Transforms/LoopVectorize/noalias-md.ll
new file mode 100644
index 00000000000..787ea88f945
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/noalias-md.ll
@@ -0,0 +1,78 @@
+; RUN: opt -basicaa -loop-vectorize -force-vector-width=2 \
+; RUN: -force-vector-interleave=1 -S < %s \
+; RUN: | FileCheck %s -check-prefix=BOTH -check-prefix=LV
+; RUN: opt -basicaa -scoped-noalias -loop-vectorize -dse -force-vector-width=2 \
+; RUN: -force-vector-interleave=1 -S < %s \
+; RUN: | FileCheck %s -check-prefix=BOTH -check-prefix=DSE
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; This loop needs to be versioned with memchecks between {A, B} x {C} before
+; it can be vectorized.
+;
+; for (i = 0; i < n; i++) {
+; C[i] = A[i] + 1;
+; C[i] += B[i];
+; }
+;
+; Check that the corresponding noalias metadata is added to the vector loop
+; but not to the scalar loop.
+;
+; Since in the versioned vector loop C and B can no longer alias, the first
+; store to C[i] can be DSE'd.
+
+
+define void @f(i32* %a, i32* %b, i32* %c) {
+entry:
+ br label %for.body
+
+; BOTH: vector.memcheck:
+; BOTH: vector.body:
+for.body: ; preds = %for.body, %entry
+ %ind = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+
+ %arrayidxA = getelementptr inbounds i32, i32* %a, i64 %ind
+; Scope 1
+; LV: = load {{.*}} !alias.scope !0
+ %loadA = load i32, i32* %arrayidxA, align 4
+
+ %add = add nuw i32 %loadA, 2
+
+ %arrayidxC = getelementptr inbounds i32, i32* %c, i64 %ind
+; Noalias with scope 1 and 6
+; LV: store {{.*}} !alias.scope !3, !noalias !5
+; DSE-NOT: store
+ store i32 %add, i32* %arrayidxC, align 4
+
+ %arrayidxB = getelementptr inbounds i32, i32* %b, i64 %ind
+; Scope 6
+; LV: = load {{.*}} !alias.scope !7
+ %loadB = load i32, i32* %arrayidxB, align 4
+
+ %add2 = add nuw i32 %add, %loadB
+
+; Noalias with scope 1 and 6
+; LV: store {{.*}} !alias.scope !3, !noalias !5
+; DSE: store
+ store i32 %add2, i32* %arrayidxC, align 4
+
+ %inc = add nuw nsw i64 %ind, 1
+ %exitcond = icmp eq i64 %inc, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+; BOTH: for.body:
+; BOTH-NOT: !alias.scope
+; BOTH-NOT: !noalias
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; LV: !0 = !{!1}
+; LV: !1 = distinct !{!1, !2}
+; LV: !2 = distinct !{!2, !"LVerDomain"}
+; LV: !3 = !{!4}
+; LV: !4 = distinct !{!4, !2}
+; LV: !5 = !{!1, !6}
+; LV: !6 = distinct !{!6, !2}
+; LV: !7 = !{!6}
diff --git a/llvm/test/Transforms/LoopVectorize/nofloat.ll b/llvm/test/Transforms/LoopVectorize/nofloat.ll
new file mode 100644
index 00000000000..6e4e95fae00
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/nofloat.ll
@@ -0,0 +1,28 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; Make sure that we don't vectorize functions with the 'noimplicitfloat' attribute.
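+; Vectorizing would implicitly create SIMD instructions, which generally
+; live in the FP/vector register file; 'noimplicitfloat' promises that the
+; compiler will not introduce such instructions on its own.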
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK-LABEL: @example12(
+;CHECK-NOT: store <4 x i32>
+;CHECK: ret void
+define void @example12() noimplicitfloat { ; <--------- "noimplicitfloat" attribute here!
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = trunc i64 %indvars.iv to i32
+ store i32 %3, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %4, label %1
+
+; <label>:4 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/non-const-n.ll b/llvm/test/Transforms/LoopVectorize/non-const-n.ll
new file mode 100644
index 00000000000..d909bc81668
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/non-const-n.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
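+; The trip count is %n << 2. It is not a compile-time constant, but it is
+; always a multiple of the forced vector width of 4, so the loop vectorizes
+; even though the bound is unknown (presumably the point of this test).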
+;CHECK-LABEL: @example1(
+;CHECK: shl i32
+;CHECK: zext i32
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i32 %n) nounwind uwtable ssp {
+ %n4 = shl i32 %n, 2
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n4
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/nontemporal.ll b/llvm/test/Transforms/LoopVectorize/nontemporal.ll
new file mode 100644
index 00000000000..b5719e12e12
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/nontemporal.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @foo(
+define void @foo(float* noalias %a, float* noalias %b, float* noalias %c, i32 %N) {
+entry:
+ %cmp.4 = icmp sgt i32 %N, 0
+ br i1 %cmp.4, label %for.body.preheader, label %for.end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+
+; Check that we don't lose the !nontemporal hint when vectorizing loads.
+; CHECK: %wide.load{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !nontemporal !0
+
+; Check that we don't introduce a !nontemporal hint when the original scalar loads didn't have one.
+; CHECK: %wide.load{{[0-9]+}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}}
+ %arrayidx2 = getelementptr inbounds float, float* %c, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %add = fadd float %0, %1
+
+; Check that we don't lose the !nontemporal hint when vectorizing stores.
+; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0
+ %arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ store float %add, float* %arrayidx4, align 4, !nontemporal !0
+
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+; CHECK: ret void
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll
new file mode 100644
index 00000000000..5b3658fb0de
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4
+
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define void @test() {
+entry:
+ br i1 undef, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:
+ br label %while.body
+
+while.body:
+ %it.sroa.0.091 = phi i32* [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ]
+ %incdec.ptr.i = getelementptr inbounds i32, i32* %it.sroa.0.091, i64 1
+  %inc32 = add i32 undef, 1                       ; <------------- Make sure we don't add NSW flags to this add of undef.
+ %cmp.i11 = icmp eq i32* %incdec.ptr.i, undef
+ br i1 %cmp.i11, label %while.end, label %while.body
+
+while.end:
+ ret void
+}
+
+
diff --git a/llvm/test/Transforms/LoopVectorize/opt.ll b/llvm/test/Transforms/LoopVectorize/opt.ll
new file mode 100644
index 00000000000..e0a928f89c8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/opt.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -O3 -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck --check-prefix=LOOPVEC %s
+; RUN: opt -S -O3 -disable-loop-vectorization -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck --check-prefix=NOLOOPVEC %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can disable vectorization in opt.
+
+; LOOPVEC: add <2 x i32>
+; NOLOOPVEC-NOT: add <2 x i32>
+
+define i32 @vect(i32* %a) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %red.05
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 255
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %add
+}
diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll
new file mode 100644
index 00000000000..403c006eeb5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/optsize.ll
@@ -0,0 +1,102 @@
+; This test verifies that the loop vectorizer will NOT produce a tail
+; loop when the function has the optimize-for-size (optsize) or
+; minimize-size (minsize) attribute.
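+; With a trip count of 203, which is not a multiple of a typical vector
+; width, vectorization would need a scalar remainder ("tail") loop; that
+; extra loop costs code size, so the vectorizer bails out instead.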
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -pgso -S | FileCheck %s -check-prefix=PGSO
+; RUN: opt < %s -loop-vectorize -pgso=false -S | FileCheck %s -check-prefix=NPGSO
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_optsize() #0 {
+; CHECK-LABEL: @foo_optsize(
+; CHECK-NOT: <2 x i8>
+; CHECK-NOT: <4 x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 202
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+attributes #0 = { optsize }
+
+define i32 @foo_minsize() #1 {
+; CHECK-LABEL: @foo_minsize(
+; CHECK-NOT: <2 x i8>
+; CHECK-NOT: <4 x i8>
+; CHECK-LABEL: @foo_pgso(
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 202
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+attributes #1 = { minsize }
+
+define i32 @foo_pgso() !prof !14 {
+; PGSO-LABEL: @foo_pgso(
+; PGSO-NOT: <{{[0-9]+}} x i8>
+; NPGSO-LABEL: @foo_pgso(
+; NPGSO: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, 202
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
new file mode 100644
index 00000000000..386c9ec40d0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test1.ll
@@ -0,0 +1,82 @@
+; extern int arr[8][8];
+; extern int arr2[8];
+;
+; void foo(int n)
+; {
+; int i1, i2;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+; for (i1 = 0; i1 < 8; i1++) {
+; arr2[i1] = i1;
+; for (i2 = 0; i2 < 8; i2++)
+; arr[i2][i1] = i1 + n;
+; }
+; }
+;
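+; Note (explanatory, not from the original test): in the VPlan native path it
+; is the outer loop that is vectorized. The outer IV is widened to <4 x i64>,
+; the stores become masked scatters, and the inner loop remains a loop whose
+; four lanes run in lockstep; since every lane has the same inner trip count,
+; the inner exit test extracts lane 0 of the vector compare.
+;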
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0
+; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]]
+; CHECK: br label %[[InnerLoop:.+]]
+
+; CHECK: [[InnerLoop]]:
+; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ]
+; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8>
+; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.inc8, %entry
+ %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+ %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+ %0 = trunc i64 %indvars.iv21 to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %1 = trunc i64 %indvars.iv21 to i32
+ %add = add nsw i32 %1, %n
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+ store i32 %add, i32* %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8: ; preds = %for.body3
+ %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+ %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+ br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
new file mode 100644
index 00000000000..18eca4c9589
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_test2.ll
@@ -0,0 +1,112 @@
+; int A[1024], B[1024];
+;
+; void foo(int iCount, int c, int jCount)
+; {
+;
+; int i, j;
+;
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+; for (i = 0; i < iCount; i++) {
+; A[i] = c;
+; for (j = 0; j < jCount; j++) {
+; A[i] += B[j] + i;
+; }
+; }
+; }
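+;
+; Note (explanatory, not from the original test): here the inner loop is
+; guarded by a runtime zero-trip check, and the guard is vectorized too: the
+; i1 condition is splat to <4 x i1> and lane 0 is extracted for the branch,
+; which is sound because the guard is uniform across lanes.
+;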
+; RUN: opt -S -loop-vectorize -enable-vplan-native-path < %s | FileCheck %s
+; CHECK: %[[ZeroTripChk:.*]] = icmp sgt i32 %jCount, 0
+; CHECK-LABEL: vector.ph:
+; CHECK: %[[CVal0:.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+; CHECK-NEXT: %[[CSplat:.*]] = shufflevector <4 x i32> %[[CVal0]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: %[[ZVal0:.*]] = insertelement <4 x i1> undef, i1 %[[ZeroTripChk]], i32 0
+; CHECK-NEXT: %[[ZSplat:.*]] = shufflevector <4 x i1> %[[ZVal0]], <4 x i1> undef, <4 x i32> zeroinitializer
+
+; CHECK-LABEL: vector.body:
+; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ]
+; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ]
+; CHECK: %[[AAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, <4 x i64> %[[VecInd]]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[CSplat]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: %[[ZCmpExtr:.*]] = extractelement <4 x i1> %[[ZSplat]], i32 0
+; CHECK: br i1 %[[ZCmpExtr]], label %[[InnerForPh:.*]], label %[[OuterInc:.*]]
+
+; CHECK: [[InnerForPh]]:
+; CHECK: %[[WideAVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32>
+; CHECK: br label %[[InnerForBody:.*]]
+
+; CHECK: [[InnerForBody]]:
+; CHECK: %[[InnerInd:.*]] = phi <4 x i64> [ %[[InnerIndNext:.*]], %[[InnerForBody]] ], [ zeroinitializer, %[[InnerForPh]] ]
+; CHECK: %[[AccumPhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext:.*]], %[[InnerForBody]] ], [ %[[WideAVal]], %[[InnerForPh]] ]
+; CHECK: %[[BAddr:.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, <4 x i64> %[[InnerInd]]
+; CHECK: %[[WideBVal:.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %[[BAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK: %[[Add1:.*]] = add nsw <4 x i32> %[[WideBVal]], %[[VecIndTr]]
+; CHECK: %[[AccumPhiNext]] = add nsw <4 x i32> %[[Add1]], %[[AccumPhi]]
+; CHECK: %[[InnerIndNext]] = add nuw nsw <4 x i64> %[[InnerInd]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK: %[[InnerVecCond:.*]] = icmp eq <4 x i64> %[[InnerIndNext]], {{.*}}
+; CHECK: %[[InnerCond:.+]] = extractelement <4 x i1> %[[InnerVecCond]], i32 0
+; CHECK: br i1 %[[InnerCond]], label %[[InnerCrit:.*]], label %[[InnerForBody]]
+
+; CHECK: [[InnerCrit]]:
+; CHECK: %[[StorePhi:.*]] = phi <4 x i32> [ %[[AccumPhiNext]], %[[InnerForBody]] ]
+; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StorePhi]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK: br label %[[ForInc]]
+
+; CHECK: [[ForInc]]:
+; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4
+; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], {{.*}}
+; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body
+
+@A = common global [1024 x i32] zeroinitializer, align 16
+@B = common global [1024 x i32] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %iCount, i32 %c, i32 %jCount) {
+entry:
+ %cmp22 = icmp sgt i32 %iCount, 0
+ br i1 %cmp22, label %for.body.lr.ph, label %for.end11
+
+for.body.lr.ph: ; preds = %entry
+ %cmp220 = icmp sgt i32 %jCount, 0
+ %wide.trip.count = zext i32 %jCount to i64
+ %wide.trip.count27 = zext i32 %iCount to i64
+ br label %for.body
+
+for.body: ; preds = %for.inc9, %for.body.lr.ph
+ %indvars.iv25 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next26, %for.inc9 ]
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv25
+ store i32 %c, i32* %arrayidx, align 4
+ br i1 %cmp220, label %for.body3.lr.ph, label %for.inc9
+
+for.body3.lr.ph: ; preds = %for.body
+ %arrayidx.promoted = load i32, i32* %arrayidx, align 4
+ %0 = trunc i64 %indvars.iv25 to i32
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.body3.lr.ph
+ %indvars.iv = phi i64 [ 0, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+ %1 = phi i32 [ %arrayidx.promoted, %for.body3.lr.ph ], [ %add8, %for.body3 ]
+ %arrayidx5 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv
+ %2 = load i32, i32* %arrayidx5, align 4
+ %add = add nsw i32 %2, %0
+ %add8 = add nsw i32 %add, %1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond1.for.inc9_crit_edge, label %for.body3
+
+for.cond1.for.inc9_crit_edge: ; preds = %for.body3
+ store i32 %add8, i32* %arrayidx, align 4
+ br label %for.inc9
+
+for.inc9: ; preds = %for.cond1.for.inc9_crit_edge, %for.body
+ %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+ %exitcond28 = icmp eq i64 %indvars.iv.next26, %wide.trip.count27
+ br i1 %exitcond28, label %for.end11, label %for.body, !llvm.loop !1
+
+for.end11: ; preds = %for.inc9, %entry
+ ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
new file mode 100644
index 00000000000..1306ed971c4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/partial-lcssa.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+; We vectorize the inner loop, so we have to put it in LCSSA form.
+; However, there's no reason to touch the outer loop.
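+;
+; A rough C analogue (hypothetical; guards and exact trip counts simplified),
+; showing why LCSSA matters here: the inner induction variable escapes its
+; loop through *O1, so the inner exit block needs a PHI to carry it out.
+;
+; long foo(int *A, int *B, long n, long m, long *O1, long *O2) {
+;   for (long j = 0; j <= m; j++) {
+;     long i;
+;     for (i = 0; i <= n; i++)
+;       B[i] = A[i];
+;     *O1 = i;  /* inner-loop value used outside the inner loop */
+;   }
+;   *O2 = m;
+;   return 0;   /* the IR returns undef */
+; }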
+
+; CHECK-LABEL: @foo
+; CHECK-LABEL: for.end.inner.loopexit:
+; CHECK: %[[LCSSAPHI:.*]] = phi i64 [ %indvars.iv, %for.body.inner ], [ %{{.*}}, %middle.block ]
+; CHECK: store i64 %[[LCSSAPHI]], i64* %O1, align 4
+; CHECK-LABEL: for.end.outer.loopexit
+; CHECK: store i64 %indvars.outer, i64* %O2, align 4
+
+
+define i64 @foo(i32* nocapture %A, i32* nocapture %B, i64 %n, i64 %m, i64* %O1, i64* %O2) {
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body.outer.preheader, label %for.end.outer
+
+for.body.outer.preheader: ; preds = %entry
+ br label %for.body.outer
+
+for.body.outer: ; preds = %for.body.outer.preheader, %for.end.inner
+ %indvars.outer = phi i64 [ %indvars.outer.next, %for.end.inner ], [ 0, %for.body.outer.preheader ]
+ %cmp2 = icmp sgt i64 %m, 0
+ br i1 %cmp2, label %for.body.inner.preheader, label %for.end.inner
+
+for.body.inner.preheader: ; preds = %for.body.outer
+ br label %for.body.inner
+
+for.body.inner: ; preds = %for.body.inner.preheader, %for.body.inner
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body.inner ], [ 0, %for.body.inner.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %v = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ store i32 %v, i32* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv, %n
+ br i1 %exitcond, label %for.end.inner.loopexit, label %for.body.inner
+
+for.end.inner.loopexit: ; preds = %for.body.inner
+ store i64 %indvars.iv, i64 *%O1, align 4
+ br label %for.end.inner
+
+for.end.inner: ; preds = %for.end.inner.loopexit, %for.body.outer
+ %indvars.outer.next = add i64 %indvars.outer, 1
+ %exitcond.outer = icmp eq i64 %indvars.outer, %m
+ br i1 %exitcond.outer, label %for.end.outer.loopexit, label %for.body.outer
+
+for.end.outer.loopexit: ; preds = %for.end.inner
+ store i64 %indvars.outer, i64 *%O2, align 4
+ br label %for.end.outer
+
+for.end.outer: ; preds = %for.end.outer.loopexit, %entry
+ ret i64 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/phi-cost.ll b/llvm/test/Transforms/LoopVectorize/phi-cost.ll
new file mode 100644
index 00000000000..057c55d55d3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/phi-cost.ll
@@ -0,0 +1,86 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
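+; A plausible C equivalent of @phi_two_incoming_values below (hypothetical;
+; not part of the original test). The two-incoming PHI %tmp5 merges the
+; if/then values; the cost model charges it a single vector select (cost 1
+; for VF 2), and instcombine later folds that select into zext+add, as the
+; CHECK lines show:
+;
+; void phi_two_incoming_values(int *a, int *b, long n) {
+;   for (long i = 0; i <= n; i++)
+;     b[i] = a[i] > 0 ? a[i] + 1 : a[i];
+; }
+;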
+; CHECK-LABEL: phi_two_incoming_values
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* {{.*}}
+; CHECK: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i1> [[TMP5]] to <2 x i32>
+; CHECK-NEXT: [[PREDPHI:%.*]] = add <2 x i32> [[WIDE_LOAD]], [[TMP6]]
+; CHECK: store <2 x i32> [[PREDPHI]], <2 x i32>* {{.*}}
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+;
+define void @phi_two_incoming_values(i32* %a, i32* %b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp3 = icmp sgt i32 %tmp1, 0
+ br i1 %tmp3, label %if.then, label %if.end
+
+if.then:
+ %tmp4 = add i32 %tmp1, 1
+ br label %if.end
+
+if.end:
+ %tmp5 = phi i32 [ %tmp1, %for.body ], [ %tmp4, %if.then ]
+ store i32 %tmp5, i32* %tmp2, align 4
+ %i.next = add i64 %i, 1
+ %cond = icmp eq i64 %i, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
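+; Likewise, a plausible C equivalent of @phi_three_incoming_values below
+; (hypothetical). Blending the three-incoming PHI %tmp8 takes two vector
+; selects, so the cost model charges it 2 for VF 2:
+;
+; void phi_three_incoming_values(int *a, int *b, long n) {
+;   for (long i = 0; i <= n; i++) {
+;     int v = 9;
+;     if (a[i] > b[i])
+;       v = a[i] > 19 ? 3 : (b[i] < 4 ? 4 : 5);
+;     a[i] = v;
+;   }
+; }
+;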
+; CHECK-LABEL: phi_three_incoming_values
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK: [[PREDPHI:%.*]] = select <2 x i1> {{.*}}, <2 x i32> <i32 3, i32 3>, <2 x i32> <i32 9, i32 9>
+; CHECK: [[PREDPHI7:%.*]] = select <2 x i1> {{.*}}, <2 x i32> {{.*}}, <2 x i32> [[PREDPHI]]
+; CHECK: store <2 x i32> [[PREDPHI7]], <2 x i32>* {{.*}}
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+;
+define void @phi_three_incoming_values(i32* %a, i32* %b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %if.end ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp3 = load i32, i32* %tmp2, align 4
+ %tmp4 = icmp sgt i32 %tmp1, %tmp3
+ br i1 %tmp4, label %if.then, label %if.end
+
+if.then:
+ %tmp5 = icmp sgt i32 %tmp1, 19
+ br i1 %tmp5, label %if.end, label %if.else
+
+if.else:
+ %tmp6 = icmp slt i32 %tmp3, 4
+ %tmp7 = select i1 %tmp6, i32 4, i32 5
+ br label %if.end
+
+if.end:
+ %tmp8 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %tmp7, %if.else ]
+ store i32 %tmp8, i32* %tmp0, align 4
+ %i.next = add i64 %i, 1
+ %cond = icmp eq i64 %i, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/phi-hang.ll b/llvm/test/Transforms/LoopVectorize/phi-hang.ll
new file mode 100644
index 00000000000..eb1aaeffde8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/phi-hang.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -loop-vectorize < %s
+
+; PR15384
+define void @test1(i32 %arg) {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb5, %bb
+ %tmp = phi i32 [ 1, %bb ], [ %tmp7, %bb5 ]
+ %tmp2 = phi i32 [ %arg, %bb ], [ %tmp9, %bb5 ]
+ br i1 true, label %bb5, label %bb3
+
+bb3: ; preds = %bb1
+ br label %bb4
+
+bb4: ; preds = %bb3
+ br label %bb5
+
+bb5: ; preds = %bb4, %bb1
+ %tmp6 = phi i32 [ 0, %bb4 ], [ %tmp, %bb1 ]
+ %tmp7 = phi i32 [ 0, %bb4 ], [ %tmp, %bb1 ]
+ %tmp8 = phi i32 [ 0, %bb4 ], [ %tmp, %bb1 ]
+ %tmp9 = add nsw i32 %tmp2, 1
+ %tmp10 = icmp eq i32 %tmp9, 0
+ br i1 %tmp10, label %bb11, label %bb1
+
+bb11: ; preds = %bb5
+ ret void
+}
+
+; PR15748
+define void @test2() {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb1, %bb
+ %tmp = phi i32 [ 0, %bb ], [ %tmp5, %bb1 ]
+ %tmp2 = phi i32 [ 0, %bb ], [ 1, %bb1 ]
+ %tmp3 = phi i32 [ 0, %bb ], [ %tmp4, %bb1 ]
+ %tmp4 = or i32 %tmp2, %tmp3
+ %tmp5 = add nsw i32 %tmp, 1
+ %tmp6 = icmp eq i32 %tmp5, 0
+ br i1 %tmp6, label %bb7, label %bb1
+
+bb7: ; preds = %bb1
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr25281.ll b/llvm/test/Transforms/LoopVectorize/pr25281.ll
new file mode 100644
index 00000000000..8f48ba5d4c2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr25281.ll
@@ -0,0 +1,58 @@
+; RUN: opt < %s -scev-aa -loop-vectorize -print-alias-sets -S -o - 2>&1 | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR25281
+; Just check that we don't crash on this test.
+; CHECK-LABEL: @foo
+define void @foo(float** noalias nocapture readonly %in, i32* noalias nocapture readonly %isCompressed, float* noalias nocapture readonly %out) {
+entry_block:
+ %tmp = getelementptr float*, float** %in, i32 0
+ %in_0 = load float*, float** %tmp, !alias.scope !0
+ %tmp1 = getelementptr i32, i32* %isCompressed, i32 0
+ %isCompressed_0 = load i32, i32* %tmp1, !alias.scope !1
+ %tmp2 = getelementptr float*, float** %in, i32 1
+ %in_1 = load float*, float** %tmp2, !alias.scope !2
+ %tmp3 = getelementptr i32, i32* %isCompressed, i32 1
+ %isCompressed_1 = load i32, i32* %tmp3, !alias.scope !3
+ br label %for_each_frames
+
+for_each_frames:
+ %frameIndex = phi i32 [ 0, %entry_block ], [ %nextFrameIndex, %for_each_frames_end ]
+ %nextFrameIndex = add nuw nsw i32 %frameIndex, 2
+ br label %for_each_channel
+
+for_each_channel:
+ %channelIndex = phi i32 [ 0, %for_each_frames ], [ %nextChannelIndex, %for_each_channel ]
+ %nextChannelIndex = add nuw nsw i32 %channelIndex, 1
+ %tmp4 = add i32 %frameIndex, %channelIndex
+ %tmp5 = xor i32 %isCompressed_0, 1
+ %tmp6 = mul i32 %frameIndex, %tmp5
+ %offset0 = add i32 %tmp6, %channelIndex
+ %tmp7 = getelementptr float, float* %in_0, i32 %offset0
+ %in_0_index = load float, float* %tmp7, align 4, !alias.scope !4
+ %tmp8 = xor i32 %isCompressed_1, 1
+ %tmp9 = mul i32 %frameIndex, %tmp8
+ %offset1 = add i32 %tmp9, %channelIndex
+ %tmp10 = getelementptr float, float* %in_1, i32 %offset1
+ %in_1_index = load float, float* %tmp10, align 4, !alias.scope !5
+ %tmp11 = fadd float %in_0_index, %in_1_index
+ %tmp12 = getelementptr float, float* %out, i32 %tmp4
+ store float %tmp11, float* %tmp12, align 4, !noalias !6
+ %tmp13 = icmp eq i32 %nextChannelIndex, 2
+ br i1 %tmp13, label %for_each_frames_end, label %for_each_channel
+
+for_each_frames_end:
+ %tmp14 = icmp eq i32 %nextFrameIndex, 512
+ br i1 %tmp14, label %return, label %for_each_frames
+
+return:
+ ret void
+}
+
+!0 = distinct !{!0}
+!1 = distinct !{!1, !0}
+!2 = distinct !{!2, !0}
+!3 = distinct !{!3, !0}
+!4 = distinct !{!4, !0}
+!5 = distinct !{!5, !0}
+!6 = !{!2, !3, !4, !5, !1}
diff --git a/llvm/test/Transforms/LoopVectorize/pr28541.ll b/llvm/test/Transforms/LoopVectorize/pr28541.ll
new file mode 100644
index 00000000000..7bb7f0987fd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr28541.ll
@@ -0,0 +1,71 @@
+; RUN: opt -loop-vectorize -pass-remarks=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; FIXME: Check for -pass-remarks-missed and -pass-remarks-analysis output when
+; addAcyclicInnerLoop emits analysis.
+
+; Check that opt does not crash on such input:
+;
+; a, b, c;
+; fn1() {
+; while (b--) {
+; c = a;
+; switch (a & 3)
+; case 0:
+; do
+; case 3:
+; case 2:
+; case 1:
+; ;
+; while (--c)
+; ;
+; }
+; }
+
+@b = common global i32 0, align 4
+@a = common global i32 0, align 4
+@c = common global i32 0, align 4
+
+; CHECK-NOT: vectorized loop
+; CHECK-LABEL: fn1
+
+define i32 @fn1() {
+entry:
+ %tmp2 = load i32, i32* @b, align 4
+ %dec3 = add nsw i32 %tmp2, -1
+ store i32 %dec3, i32* @b, align 4
+ %tobool4 = icmp eq i32 %tmp2, 0
+ br i1 %tobool4, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph: ; preds = %entry
+ %tmp1 = load i32, i32* @a, align 4
+ %and = and i32 %tmp1, 3
+ %switch = icmp eq i32 %and, 0
+ br label %while.body
+
+while.cond: ; preds = %do.cond
+ %dec = add nsw i32 %dec7, -1
+ %tobool = icmp eq i32 %dec7, 0
+ br i1 %tobool, label %while.cond.while.end_crit_edge, label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.cond
+ %dec7 = phi i32 [ %dec3, %while.body.lr.ph ], [ %dec, %while.cond ]
+ br i1 %switch, label %do.body, label %do.cond
+
+do.body: ; preds = %do.cond, %while.body
+ %dec25 = phi i32 [ %dec2, %do.cond ], [ %tmp1, %while.body ]
+ br label %do.cond
+
+do.cond: ; preds = %do.body, %while.body
+ %dec26 = phi i32 [ %dec25, %do.body ], [ %tmp1, %while.body ]
+ %dec2 = add nsw i32 %dec26, -1
+ %tobool3 = icmp eq i32 %dec2, 0
+ br i1 %tobool3, label %while.cond, label %do.body
+
+while.cond.while.end_crit_edge: ; preds = %while.cond
+ store i32 0, i32* @c, align 4
+ store i32 -1, i32* @b, align 4
+ br label %while.end
+
+while.end: ; preds = %while.cond.while.end_crit_edge, %entry
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
new file mode 100644
index 00000000000..d9c9632be04
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr30654-phiscev-sext-trunc.ll
@@ -0,0 +1,241 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the vectorizer identifies the %p.09 phi
+; as an induction variable, despite the potential overflow
+; due to the truncation from 32 bits to 8 bits.
+; SCEV will detect the pattern "sext(trunc(%p.09)) + %step"
+; and generate the required runtime checks under which
+; we can assume no overflow. We check here that we generate
+; exactly two runtime checks:
+; 1) an overflow check:
+; {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: <nssw>
+; 2) an equality check verifying that the step of the induction
+; is equal to sext(trunc(step)):
+; Equal predicate: %step == (sext i8 (trunc i32 %step to i8) to i32)
+;
+; See also pr30654.
+;
+; int a[N];
+; void doit1(int n, int step) {
+; int i;
+; char p = 0;
+; for (i = 0; i < n; i++) {
+; a[i] = p;
+; p = p + step;
+; }
+; }
+;
+
+; CHECK-LABEL: @doit1
+; CHECK: vector.scevcheck
+; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow
+; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]]
+; CHECK: %ident.check = icmp ne i32 {{.*}}, %{{.*}}
+; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check
+; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK: vector.body:
+; CHECK: <4 x i32>
+
+@a = common local_unnamed_addr global [250 x i32] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @doit1(i32 %n, i32 %step) local_unnamed_addr {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %sext = shl i32 %p.09, 24
+ %conv = ashr exact i32 %sext, 24
+ %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add nsw i32 %conv, %step
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Same as above, but for checking the SCEV "zext(trunc(%p.09)) + %step".
+; Here we expect the following two predicates to be added for runtime checking:
+; 1) {0,+,(trunc i32 %step to i8)}<%for.body> Added Flags: <nusw>
+; 2) Equal predicate: %step == (sext i8 (trunc i32 %step to i8) to i32)
+;
+; int a[N];
+; void doit2(int n, int step) {
+; int i;
+; unsigned char p = 0;
+; for (i = 0; i < n; i++) {
+; a[i] = p;
+; p = p + step;
+; }
+; }
+;
+
+; CHECK-LABEL: @doit2
+; CHECK: vector.scevcheck
+; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK: %[[TEST:[0-9]+]] = or i1 {{.*}}, %mul.overflow
+; CHECK: %[[NTEST:[0-9]+]] = or i1 false, %[[TEST]]
+; CHECK: %[[EXT:[0-9]+]] = sext i8 {{.*}} to i32
+; CHECK: %ident.check = icmp ne i32 {{.*}}, %[[EXT]]
+; CHECK: %{{.*}} = or i1 %[[NTEST]], %ident.check
+; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK: vector.body:
+; CHECK: <4 x i32>
+
+; Function Attrs: norecurse nounwind uwtable
+define void @doit2(i32 %n, i32 %step) local_unnamed_addr {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %p.09 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %conv = and i32 %p.09, 255
+ %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add nsw i32 %conv, %step
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; Here we check that the same phi scev analysis would fail
+; to create the runtime checks because the step is not invariant.
+; As a result vectorization will fail.
+;
+; int a[N];
+; void doit3(int n, int step) {
+; int i;
+; char p = 0;
+; for (i = 0; i < n; i++) {
+; a[i] = p;
+; p = p + step;
+; step += 2;
+; }
+; }
+;
+
+; CHECK-LABEL: @doit3
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
+
+; Function Attrs: norecurse nounwind uwtable
+define void @doit3(i32 %n, i32 %step) local_unnamed_addr {
+entry:
+ %cmp9 = icmp sgt i32 %n, 0
+ br i1 %cmp9, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %p.012 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %step.addr.010 = phi i32 [ %add3, %for.body ], [ %step, %for.body.preheader ]
+ %sext = shl i32 %p.012, 24
+ %conv = ashr exact i32 %sext, 24
+ %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add nsw i32 %conv, %step.addr.010
+ %add3 = add nsw i32 %step.addr.010, 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; Lastly, we also check the case where we can tell at compile time that
+; the step of the induction is equal to sext(trunc(step)), in which case
+; we don't have to check this equality at runtime (we only need the
+; runtime overflow check). Therefore only the following overflow predicate
+; will be added for runtime checking:
+; {0,+,%cstep}<%for.body> Added Flags: <nssw>
+;
+; a[N];
+; void doit4(int n, char cstep) {
+; int i;
+; char p = 0;
+; int istep = cstep;
+; for (i = 0; i < n; i++) {
+; a[i] = p;
+; p = p + istep;
+; }
+; }
+
+; CHECK-LABEL: @doit4
+; CHECK: vector.scevcheck
+; CHECK: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK: %{{.*}} = or i1 {{.*}}, %mul.overflow
+; CHECK-NOT: %ident.check = icmp ne i32 {{.*}}, %{{.*}}
+; CHECK-NOT: %{{.*}} = or i1 %{{.*}}, %ident.check
+; CHECK-NOT: %mul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 {{.*}}, i8 {{.*}})
+; CHECK: vector.body:
+; CHECK: <4 x i32>
+
+; Function Attrs: norecurse nounwind uwtable
+define void @doit4(i32 %n, i8 signext %cstep) local_unnamed_addr {
+entry:
+ %conv = sext i8 %cstep to i32
+ %cmp10 = icmp sgt i32 %n, 0
+ br i1 %cmp10, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %p.011 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+ %sext = shl i32 %p.011, 24
+ %conv2 = ashr exact i32 %sext, 24
+ %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %conv2, i32* %arrayidx, align 4
+ %add = add nsw i32 %conv2, %conv
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll b/llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll
new file mode 100644
index 00000000000..e04bdd6607d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr30806-phi-scev.ll
@@ -0,0 +1,66 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; Produced from the test-case:
+;
+; extern void foo(char *, unsigned , unsigned *);
+; extern void bar(int *, long);
+; extern char *processBuf(char *);
+;
+; extern unsigned theSize;
+;
+; void foo(char *buf, unsigned denominator, unsigned *flag) {
+; int incr = (int) (theSize / denominator);
+; int inx = 0;
+; while (*flag) {
+; int itmp = inx + incr;
+; int i = (int) theSize;
+; bar(&i, (long) itmp);
+; buf = processBuf(buf);
+; inx = itmp;
+; }
+; }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@theSize = external local_unnamed_addr global i32, align 4
+
+define void @foo(i8* %buf, i32 %denominator, i32* %flag) local_unnamed_addr {
+entry:
+ %i = alloca i32, align 4
+ %0 = load i32, i32* @theSize, align 4
+ %div = udiv i32 %0, %denominator
+ %1 = load i32, i32* %flag, align 4
+ %tobool5 = icmp eq i32 %1, 0
+ br i1 %tobool5, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph: ; preds = %entry
+ %2 = bitcast i32* %i to i8*
+ br label %while.body
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+; Check that there are two PHIs followed by a 'sext' in the same block, and that
+; the test does not crash.
+; CHECK: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: sext
+ %buf.addr.07 = phi i8* [ %buf, %while.body.lr.ph ], [ %call, %while.body ]
+ %inx.06 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
+ %add = add nsw i32 %inx.06, %div
+ %3 = load i32, i32* @theSize, align 4
+ store i32 %3, i32* %i, align 4
+ %conv = sext i32 %add to i64
+ call void @bar(i32* nonnull %i, i64 %conv)
+ %call = call i8* @processBuf(i8* %buf.addr.07)
+ %4 = load i32, i32* %flag, align 4
+ %tobool = icmp eq i32 %4, 0
+ br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ ret void
+}
+
+declare void @bar(i32*, i64) local_unnamed_addr
+declare i8* @processBuf(i8*) local_unnamed_addr
diff --git a/llvm/test/Transforms/LoopVectorize/pr30806.ll b/llvm/test/Transforms/LoopVectorize/pr30806.ll
new file mode 100644
index 00000000000..dd9f6629413
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr30806.ll
@@ -0,0 +1,65 @@
+; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Produced from test-case:
+;
+; void testGuardedInnerLoop(uint32_t *ptr, uint32_t denom, uint32_t numer, uint32_t outer_lim)
+; {
+; for(uint32_t outer_i = 0; outer_i < outer_lim; ++outer_i) {
+; if (denom > 0) {
+; const uint32_t lim = numer / denom;
+;
+; for (uint32_t i = 0; i < lim; ++i)
+; ptr[i] = 1;
+; }
+; }
+; }
+
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @testGuardedInnerLoop(i32* %ptr, i32 %denom, i32 %numer, i32 %outer_lim) {
+entry:
+ %cmp1 = icmp eq i32 %outer_lim, 0
+ br i1 %cmp1, label %exit, label %loop1.preheader
+
+; Verify that a 'udiv' does not appear between the 'loop1.preheader' label and
+; whatever label comes next.
+loop1.preheader:
+; CHECK-LABEL: loop1.preheader:
+; CHECK-NOT: udiv
+; CHECK-LABEL: :
+ br label %loop1
+
+loop1:
+ %outer_i = phi i32 [ %inc1, %loop2.exit ], [ 0, %loop1.preheader ]
+ %0 = add i32 %denom, -1
+ %1 = icmp ult i32 %0, %numer
+ br i1 %1, label %loop2.preheader, label %loop2.exit
+
+; Verify that a 'udiv' does appear between the 'loop2.preheader' label and
+; whatever label comes next.
+loop2.preheader:
+; CHECK-LABEL: loop2.preheader:
+; CHECK: udiv
+; CHECK-LABEL: :
+ %lim = udiv i32 %numer, %denom
+ %2 = zext i32 %lim to i64
+ br label %loop2
+
+loop2:
+ %indvar.loop2 = phi i64 [ 0, %loop2.preheader ], [ %indvar.loop2.next, %loop2 ]
+ %arrayidx = getelementptr inbounds i32, i32* %ptr, i64 %indvar.loop2
+ store i32 1, i32* %arrayidx, align 4
+ %indvar.loop2.next = add nuw nsw i64 %indvar.loop2, 1
+ %cmp2 = icmp ult i64 %indvar.loop2.next, %2
+ br i1 %cmp2, label %loop2, label %loop2.exit
+
+loop2.exit:
+ %inc1 = add nuw i32 %outer_i, 1
+ %exitcond = icmp eq i32 %inc1, %outer_lim
+ br i1 %exitcond, label %exit, label %loop1
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr31098.ll b/llvm/test/Transforms/LoopVectorize/pr31098.ll
new file mode 100644
index 00000000000..368a948557c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr31098.ll
@@ -0,0 +1,100 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -debug-only=loop-accesses < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the compile-time-unknown dependence distance is resolved
+; statically. Due to the non-unit stride of the accesses in this testcase
+; we are currently not able to create runtime dependence checks, and therefore
+; if we don't resolve the dependence statically we cannot vectorize the loop.
+;
+; Specifically in this example, during dependence analysis we get 6 unknown
+; dependence distances between the 8 real/imaginary accesses below:
+; dist = 8*D, 4+8*D, -4+8*D, -8*D, 4-8*D, -4-8*D.
+; At compile time we can prove for all of the above that |dist|>loopBound*step
+; (where the step is 8bytes, and the loopBound is D-1), and thereby conclude
+; that there are no dependencies (without runtime tests):
+; |8*D|>8*D-8, |4+8*D|>8*D-8, |-4+8*D|>8*D-8, etc.
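+; The tightest of these is |-4+8*D| = 8*D-4, which indeed exceeds 8*D-8
+; for every D >= 1.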
+
+; #include <stdlib.h>
+; class Complex {
+; private:
+; float real_;
+; float imaginary_;
+;
+; public:
+; Complex() : real_(0), imaginary_(0) { }
+; Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { }
+; Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { }
+;
+; inline float real() const { return real_; }
+; inline float imaginary() const { return imaginary_; }
+;
+; Complex operator+(const Complex& rhs) const
+; {
+; return Complex(real_ + rhs.real_, imaginary_ + rhs.imaginary_);
+; }
+;
+; Complex operator-(const Complex& rhs) const
+; {
+; return Complex(real_ - rhs.real_, imaginary_ - rhs.imaginary_);
+; }
+; };
+;
+; void Test(Complex *out, size_t size)
+; {
+; size_t D = size / 2;
+; for (size_t offset = 0; offset < D; ++offset)
+; {
+; Complex t0 = out[offset];
+; Complex t1 = out[offset + D];
+; out[offset] = t1 + t0;
+; out[offset + D] = t0 - t1;
+; }
+; }
+
+; CHECK-LABEL: Test
+; CHECK: LAA: No unsafe dependent memory operations in loop. We don't need runtime memory checks.
+; CHECK: vector.body:
+; CHECK: <4 x i32>
+
+%class.Complex = type { float, float }
+
+define void @Test(%class.Complex* nocapture %out, i64 %size) local_unnamed_addr {
+entry:
+ %div = lshr i64 %size, 1
+ %cmp47 = icmp eq i64 %div, 0
+ br i1 %cmp47, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %offset.048 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %0 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 0
+ %1 = load float, float* %0, align 4
+ %imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 1
+ %2 = load float, float* %imaginary_.i.i, align 4
+ %add = add nuw i64 %offset.048, %div
+ %3 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 0
+ %4 = load float, float* %3, align 4
+ %imaginary_.i.i28 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 1
+ %5 = load float, float* %imaginary_.i.i28, align 4
+ %add.i = fadd fast float %4, %1
+ %add4.i = fadd fast float %5, %2
+ store float %add.i, float* %0, align 4
+ store float %add4.i, float* %imaginary_.i.i, align 4
+ %sub.i = fsub fast float %1, %4
+ %sub4.i = fsub fast float %2, %5
+ store float %sub.i, float* %3, align 4
+ store float %sub4.i, float* %imaginary_.i.i28, align 4
+ %inc = add nuw nsw i64 %offset.048, 1
+ %exitcond = icmp eq i64 %inc, %div
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr31190.ll b/llvm/test/Transforms/LoopVectorize/pr31190.ll
new file mode 100644
index 00000000000..c9790be8220
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr31190.ll
@@ -0,0 +1,63 @@
+; RUN: opt -passes='loop-vectorize' -debug -S < %s 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; This checks we don't crash when the inner loop we're trying to vectorize
+; is a SCEV AddRec with respect to an outer loop.
+
+; In this case, the problematic PHI is:
+; %0 = phi i32 [ undef, %for.cond1.preheader ], [ %inc54, %for.body3 ]
+; Since %inc54 is the IV of the outer loop, and %0 equivalent to it,
+; we get the situation described above.
+
+; Code that leads to this situation can look something like:
+;
+; int a, b[1], c;
+; void fn1 ()
+; {
+; for (; c; c++)
+; for (a = 0; a; a++)
+; b[c] = 4;
+; }
+;
+; The PHI is an artifact of the register promotion of c.
+
+; Note that we can no longer get the vectorizer to actually see such PHIs,
+; because LV now simplifies the loop internally, but the test is still
+; useful as a regression test, and in case loop-simplify behavior changes.
+
+@c = external global i32, align 4
+@a = external global i32, align 4
+@b = external global [1 x i32], align 4
+
+; We can vectorize this loop because we are storing an invariant value into an
+; invariant address.
+
+; CHECK: LV: We can vectorize this loop!
+; CHECK-LABEL: @test
+define void @test() {
+entry:
+ %a.promoted2 = load i32, i32* @a, align 1
+ %c.promoted = load i32, i32* @c, align 1
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.for.inc4_crit_edge, %entry
+ %inc54 = phi i32 [ %inc5, %for.cond1.for.inc4_crit_edge ], [ %c.promoted, %entry ]
+ %inc.lcssa3 = phi i32 [ %inc.lcssa, %for.cond1.for.inc4_crit_edge ], [ %a.promoted2, %entry ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %inc1 = phi i32 [ %inc.lcssa3, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %0 = phi i32 [ undef, %for.cond1.preheader ], [ %inc54, %for.body3 ]
+ %idxprom = sext i32 %0 to i64
+ %arrayidx = getelementptr inbounds [1 x i32], [1 x i32]* @b, i64 0, i64 %idxprom
+ store i32 4, i32* %arrayidx, align 4
+ %inc = add nsw i32 %inc1, 1
+ %tobool2 = icmp eq i32 %inc, 0
+ br i1 %tobool2, label %for.cond1.for.inc4_crit_edge, label %for.body3
+
+for.cond1.for.inc4_crit_edge: ; preds = %for.body3
+ %inc.lcssa = phi i32 [ %inc, %for.body3 ]
+ %.lcssa = phi i32 [ %inc54, %for.body3 ]
+ %inc5 = add nsw i32 %.lcssa, 1
+ br label %for.cond1.preheader
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll
new file mode 100644
index 00000000000..31cb84699f7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; Out of LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]',
+; but the IR Verifier requires a PHI to have one entry for each predecessor
+; of its parent basic block. The original PR14725 fix simply added 'undef'
+; for the extra predecessor BB, which is not correct. Instead we copy the
+; real value from the other predecessor rather than introducing 'undef'.
+
+; CHECK-LABEL: for.cond.preheader:
+; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ]
+
+; Function Attrs: nounwind uwtable
+define void @main() #0 {
+entry:
+ br label %for.cond1.preheader.i
+
+for.cond1.preheader.i: ; preds = %if.end.2.i, %entry
+ %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ]
+ %tobool.i = icmp ne i32 undef, 0
+ br label %if.end.2.i
+
+if.end.2.i: ; preds = %for.cond1.preheader.i
+ %inc5.i = add nsw i32 %c.06.i, 1
+ %cmp.i = icmp slt i32 %inc5.i, 16
+ br i1 %cmp.i, label %for.cond1.preheader.i, label %for.cond.preheader
+
+for.cond.preheader: ; preds = %if.end.2.i
+ %e.0.ph = phi i32 [ 0, %if.end.2.i ]
+ unreachable
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr33706.ll b/llvm/test/Transforms/LoopVectorize/pr33706.ll
new file mode 100644
index 00000000000..b9d0d8a44ac
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr33706.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck %s
+
+@global = local_unnamed_addr global i32 0, align 4
+@global.1 = local_unnamed_addr global i32 0, align 4
+@global.2 = local_unnamed_addr global float 0x3EF0000000000000, align 4
+
+; CHECK-LABEL: @PR33706
+; CHECK-NOT: <2 x i32>
+define void @PR33706(float* nocapture readonly %arg, float* nocapture %arg1, i32 %arg2) local_unnamed_addr {
+bb:
+ %tmp = load i32, i32* @global.1, align 4
+ %tmp3 = getelementptr inbounds float, float* %arg, i64 190
+ %tmp4 = getelementptr inbounds float, float* %arg1, i64 512
+ %tmp5 = and i32 %tmp, 65535
+ %tmp6 = icmp ugt i32 %arg2, 65536
+ br i1 %tmp6, label %bb7, label %bb9
+
+bb7: ; preds = %bb
+ %tmp8 = load i32, i32* @global, align 4
+ br label %bb27
+
+bb9: ; preds = %bb
+ %tmp10 = udiv i32 65536, %arg2
+ br label %bb11
+
+bb11: ; preds = %bb11, %bb9
+ %tmp12 = phi i32 [ %tmp20, %bb11 ], [ %tmp5, %bb9 ]
+ %tmp13 = phi float* [ %tmp18, %bb11 ], [ %tmp4, %bb9 ]
+ %tmp14 = phi i32 [ %tmp16, %bb11 ], [ %tmp10, %bb9 ]
+ %tmp15 = phi i32 [ %tmp19, %bb11 ], [ %tmp, %bb9 ]
+ %tmp16 = add nsw i32 %tmp14, -1
+ %tmp17 = sitofp i32 %tmp12 to float
+ store float %tmp17, float* %tmp13, align 4
+ %tmp18 = getelementptr inbounds float, float* %tmp13, i64 1
+ %tmp19 = add i32 %tmp15, %arg2
+ %tmp20 = and i32 %tmp19, 65535
+ %tmp21 = icmp eq i32 %tmp16, 0
+ br i1 %tmp21, label %bb22, label %bb11
+
+bb22: ; preds = %bb11
+ %tmp23 = phi float* [ %tmp18, %bb11 ]
+ %tmp24 = phi i32 [ %tmp19, %bb11 ]
+ %tmp25 = phi i32 [ %tmp20, %bb11 ]
+ %tmp26 = ashr i32 %tmp24, 16
+ store i32 %tmp26, i32* @global, align 4
+ br label %bb27
+
+bb27: ; preds = %bb22, %bb7
+ %tmp28 = phi i32 [ %tmp26, %bb22 ], [ %tmp8, %bb7 ]
+ %tmp29 = phi float* [ %tmp23, %bb22 ], [ %tmp4, %bb7 ]
+ %tmp30 = phi i32 [ %tmp25, %bb22 ], [ %tmp5, %bb7 ]
+ %tmp31 = sext i32 %tmp28 to i64
+ %tmp32 = getelementptr inbounds float, float* %tmp3, i64 %tmp31
+ %tmp33 = load float, float* %tmp32, align 4
+ %tmp34 = sitofp i32 %tmp30 to float
+ %tmp35 = load float, float* @global.2, align 4
+ %tmp36 = fmul float %tmp35, %tmp34
+ %tmp37 = fadd float %tmp33, %tmp36
+ store float %tmp37, float* %tmp29, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr34681.ll b/llvm/test/Transforms/LoopVectorize/pr34681.ll
new file mode 100644
index 00000000000..e93265e2ed5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr34681.ll
@@ -0,0 +1,122 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check the scenario where we have an unknown Stride, which happens to also be
+; the loop iteration count, so if we specialize the loop for the Stride==1 case,
+; this also implies that the loop will iterate no more than a single iteration,
+; as in the following example:
+;
+; unsigned int N;
+; int tmp = 0;
+; for(unsigned int k=0;k<N;k++) {
+; tmp+=(int)B[k*N+j];
+; }
+;
+; We check here that the following runtime scev guard for Stride==1 is NOT generated:
+; vector.scevcheck:
+; %ident.check = icmp ne i32 %N, 1
+; %0 = or i1 false, %ident.check
+; br i1 %0, label %scalar.ph, label %vector.ph
+; Instead the loop is vectorized with an unknown stride.
+
+; CHECK-LABEL: @foo1
+; CHECK: for.body.lr.ph
+; CHECK-NOT: %ident.check = icmp ne i32 %N, 1
+; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
+; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
+; CHECK: vector.ph
+; CHECK: vector.body
+; CHECK: <4 x i32>
+; CHECK: middle.block
+; CHECK: scalar.ph
+
+
+define i32 @foo1(i32 %N, i16* nocapture readnone %A, i16* nocapture readonly %B, i32 %i, i32 %j) {
+entry:
+ %cmp8 = icmp eq i32 %N, 0
+ br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %tmp.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add1, %for.body ]
+ %k.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = mul i32 %k.09, %N
+ %add = add i32 %mul, %j
+ %arrayidx = getelementptr inbounds i16, i16* %B, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv = sext i16 %0 to i32
+ %add1 = add nsw i32 %tmp.010, %conv
+ %inc = add nuw i32 %k.09, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %add1.lcssa = phi i32 [ %add1, %for.body ]
+ br label %for.end
+
+for.end:
+ %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add1.lcssa, %for.end.loopexit ]
+ ret i32 %tmp.0.lcssa
+}
+
+
+; Check the same, but also where the Stride and the loop iteration count
+; are not of the same data type.
+;
+; unsigned short N;
+; int tmp = 0;
+; for(unsigned int k=0;k<N;k++) {
+; tmp+=(int)B[k*N+j];
+; }
+;
+; We check here that the following runtime scev guard for Stride==1 is NOT generated:
+; vector.scevcheck:
+; %ident.check = icmp ne i16 %N, 1
+; %0 = or i1 false, %ident.check
+; br i1 %0, label %scalar.ph, label %vector.ph
+
+
+; CHECK-LABEL: @foo2
+; CHECK: for.body.lr.ph
+; CHECK-NOT: %ident.check = icmp ne i16 %N, 1
+; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
+; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
+; CHECK: vector.ph
+; CHECK: vector.body
+; CHECK: <4 x i32>
+; CHECK: middle.block
+; CHECK: scalar.ph
+
+define i32 @foo2(i16 zeroext %N, i16* nocapture readnone %A, i16* nocapture readonly %B, i32 %i, i32 %j) {
+entry:
+ %conv = zext i16 %N to i32
+ %cmp11 = icmp eq i16 %N, 0
+ br i1 %cmp11, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.body:
+ %tmp.013 = phi i32 [ 0, %for.body.lr.ph ], [ %add4, %for.body ]
+ %k.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = mul nuw i32 %k.012, %conv
+ %add = add i32 %mul, %j
+ %arrayidx = getelementptr inbounds i16, i16* %B, i32 %add
+ %0 = load i16, i16* %arrayidx, align 2
+ %conv3 = sext i16 %0 to i32
+ %add4 = add nsw i32 %tmp.013, %conv3
+ %inc = add nuw nsw i32 %k.012, 1
+ %exitcond = icmp eq i32 %inc, %conv
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ %add4.lcssa = phi i32 [ %add4, %for.body ]
+ br label %for.end
+
+for.end:
+ %tmp.0.lcssa = phi i32 [ 0, %entry ], [ %add4.lcssa, %for.end.loopexit ]
+ ret i32 %tmp.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr35743.ll b/llvm/test/Transforms/LoopVectorize/pr35743.ll
new file mode 100644
index 00000000000..7dc67e4a9b6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr35743.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This cannot be correctly vectorized with type i1.
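+; Concretely (explanatory note): %accum.and = and i8 %accum.phi, 1 yields
+; only 0 or 1, but the following add produces 3 or 4, which do not fit in
+; i1; since that i8 value is returned, demoting the chain to i1 would drop
+; the high bits of the add.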
+define i8 @test_01(i8 %c) #0 {
+
+; CHECK-LABEL: @test_01(
+; CHECK-NOT: vector.body:
+; CHECK-NOT: zext i1 {{.*}} to i8
+
+entry:
+ br label %loop
+
+exit: ; preds = %loop
+ ret i8 %accum.plus
+
+loop: ; preds = %loop, %entry
+ %accum.phi = phi i8 [ %c, %entry ], [ %accum.plus, %loop ]
+ %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
+ %accum.and = and i8 %accum.phi, 1
+ %accum.plus = add nuw nsw i8 %accum.and, 3
+ %iv.next = add nuw nsw i32 %iv, 1
+ %cond = icmp ugt i32 %iv, 191
+ br i1 %cond, label %exit, label %loop
+}
+
+; TODO: This can be vectorized with type i1 because the result is not used.
+define void @test_02(i8 %c) #0 {
+
+; CHECK-LABEL: @test_02(
+; CHECK-NOT: vector.body:
+
+entry:
+ br label %loop
+
+exit: ; preds = %loop
+ %lcssa = phi i8 [ %accum.plus, %loop ]
+ ret void
+
+loop: ; preds = %loop, %entry
+ %accum.phi = phi i8 [ %c, %entry ], [ %accum.plus, %loop ]
+ %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
+ %accum.and = and i8 %accum.phi, 1
+ %accum.plus = add nuw nsw i8 %accum.and, 3
+ %iv.next = add nuw nsw i32 %iv, 1
+ %cond = icmp ugt i32 %iv, 191
+ br i1 %cond, label %exit, label %loop
+}
+
+; This can be vectorized with type i1 because the result is truncated properly.
+define i1 @test_03(i8 %c) #0 {
+
+; CHECK-LABEL: @test_03(
+; CHECK: vector.body:
+; CHECK: zext i1 {{.*}} to i8
+
+entry:
+ br label %loop
+
+exit: ; preds = %loop
+ %lcssa = phi i8 [ %accum.plus, %loop ]
+ %trunc = trunc i8 %lcssa to i1
+ ret i1 %trunc
+
+loop: ; preds = %loop, %entry
+ %accum.phi = phi i8 [ %c, %entry ], [ %accum.plus, %loop ]
+ %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
+ %accum.and = and i8 %accum.phi, 1
+ %accum.plus = add nuw nsw i8 %accum.and, 3
+ %iv.next = add nuw nsw i32 %iv, 1
+ %cond = icmp ugt i32 %iv, 191
+ br i1 %cond, label %exit, label %loop
+}
+
+; This cannot be vectorized with type i1 because the result is truncated to
+; the wrong type.
+; TODO: It can also be vectorized with type i32 (or maybe i4?)
+define i4 @test_04(i8 %c) #0 {
+
+; CHECK-LABEL: @test_04(
+; CHECK-NOT: vector.body:
+; CHECK-NOT: zext i1 {{.*}} to i8
+
+entry:
+ br label %loop
+
+exit: ; preds = %loop
+ %lcssa = phi i8 [ %accum.plus, %loop ]
+ %trunc = trunc i8 %lcssa to i4
+ ret i4 %trunc
+
+loop: ; preds = %loop, %entry
+ %accum.phi = phi i8 [ %c, %entry ], [ %accum.plus, %loop ]
+ %iv = phi i32 [ 1, %entry ], [ %iv.next, %loop ]
+ %accum.and = and i8 %accum.phi, 1
+ %accum.plus = add nuw nsw i8 %accum.and, 3
+ %iv.next = add nuw nsw i32 %iv, 1
+ %cond = icmp ugt i32 %iv, 191
+ br i1 %cond, label %exit, label %loop
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr35773.ll b/llvm/test/Transforms/LoopVectorize/pr35773.ll
new file mode 100644
index 00000000000..362ece70b89
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr35773.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
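+;
+; Explanatory note: %i8.iv below is an i8 induction fed by a truncation of
+; the i32 induction %i32.iv. The vectorizer is expected to recognize the
+; truncated value as an induction in its own right and materialize a separate
+; <4 x i8> vector IV for it ([[IV_FROM_TRUNC]] in the CHECK lines), instead
+; of truncating the wide IV on every iteration.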
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+@a = common local_unnamed_addr global i32 0, align 4
+@b = common local_unnamed_addr global i8 0, align 1
+
+; Function Attrs: norecurse nounwind uwtable
+define void @doit1() local_unnamed_addr{
+entry:
+ br label %for.body
+
+for.body:
+ %main.iv = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+
+ %i8.iv = phi i8 [ 0, %entry ], [ %i8.add, %for.body ]
+ %i32.iv = phi i32 [ 0, %entry ], [ %i32.add, %for.body ]
+
+ %trunc.to.be.converted.to.new.iv = trunc i32 %i32.iv to i8
+ %i8.add = add i8 %i8.iv, %trunc.to.be.converted.to.new.iv
+
+ %noop.conv.under.pse = and i32 %i32.iv, 255
+ %i32.add = add nuw nsw i32 %noop.conv.under.pse, 9
+
+ %inc = add i32 %main.iv, 1
+ %tobool = icmp eq i32 %inc, 16
+ br i1 %tobool, label %for.cond.for.end_crit_edge, label %for.body
+
+; CHECK-LABEL: @doit1(
+; CHECK: vector.body:
+; CHECK-NEXT: [[MAIN_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[MAIN_IV_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT: [[I8_IV:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[I8_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[I32_IV:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[I32_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV_FROM_TRUNC:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[IV_FROM_TRUNC_NEXT:%.*]], [[VECTOR_BODY]] ]
+
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[MAIN_IV]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[MAIN_IV]], 0
+
+; CHECK-NEXT: [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]]
+
+; CHECK-NEXT: [[MAIN_IV_NEXT]] = add i32 [[MAIN_IV]], 4
+; CHECK-NEXT: [[I32_IV_NEXT]] = add <4 x i32> [[I32_IV]], <i32 36, i32 36, i32 36, i32 36>
+; CHECK-NEXT: [[IV_FROM_TRUNC_NEXT]] = add <4 x i8> [[IV_FROM_TRUNC]], <i8 36, i8 36, i8 36, i8 36>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[MAIN_IV_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+
+for.cond.for.end_crit_edge:
+ store i8 %i8.add, i8* @b, align 1
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr36311.ll b/llvm/test/Transforms/LoopVectorize/pr36311.ll
new file mode 100644
index 00000000000..ba48924f9de
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr36311.ll
@@ -0,0 +1,49 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -S < %s
+;
+; Cleaned up version of fe_tools.all_dimensions.ll from PR36311.
+; Forcing VF=2 to trigger vector code gen
+;
+; This is a test case that lets the vectorizer's code gen modify the CFG and
+; leave the DomTree out of date, such that an assert from SCEV would trigger
+; if SCEV were reanalyzed afterwards. Once vector code gen starts, the
+; vectorizer should not trigger recomputation of analyses.
+
+$test = comdat any
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: uwtable
+define dso_local void @test() local_unnamed_addr #0 comdat align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+ br label %for.body51
+
+for.body51: ; preds = %for.body51, %entry
+ br i1 undef, label %for.body51, label %for.body89.lr.ph
+
+for.cond80.loopexit: ; preds = %for.body89
+ %inc94.lcssa = phi i32 [ %inc94, %for.body89 ]
+ br i1 undef, label %for.body89.lr.ph, label %nrvo.skipdtor.loopexit
+
+for.body89.lr.ph: ; preds = %for.cond80.loopexit, %for.body51
+ %i79.0179 = phi i32 [ %add90, %for.cond80.loopexit ], [ 0, %for.body51 ]
+ %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ undef, %for.body51 ]
+ %add90 = add nuw i32 %i79.0179, 1
+ %mul91 = mul i32 %add90, undef
+ br label %for.body89
+
+for.body89: ; preds = %for.body89, %for.body89.lr.ph
+ %j.0175 = phi i32 [ 0, %for.body89.lr.ph ], [ %add92, %for.body89 ]
+ %next_index.5174 = phi i32 [ %next_index.4178, %for.body89.lr.ph ], [ %inc94, %for.body89 ]
+ %add92 = add nuw i32 %j.0175, 1
+ %add93 = add i32 %add92, %mul91
+ %inc94 = add i32 %next_index.5174, 1
+ %conv95 = zext i32 %next_index.5174 to i64
+ %arrayidx.i160 = getelementptr inbounds i32, i32* undef, i64 %conv95
+ store i32 %add93, i32* %arrayidx.i160, align 4
+;, !tbaa !1
+ %cmp87 = icmp ult i32 %add92, undef
+ br i1 %cmp87, label %for.body89, label %for.cond80.loopexit
+
+nrvo.skipdtor.loopexit: ; preds = %for.cond80.loopexit
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr36983.ll b/llvm/test/Transforms/LoopVectorize/pr36983.ll
new file mode 100644
index 00000000000..71b17192255
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr36983.ll
@@ -0,0 +1,24 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; There can be more than one LCSSA PHI in the loop exit block.
+
+; CHECK-LABEL: bb1.bb3_crit_edge:
+; CHECK: %_tmp133.lcssa1 = phi i16 [ %scalar.recur, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ]
+; CHECK: %_tmp133.lcssa = phi i16 [ %scalar.recur, %bb2 ], [ %vector.recur.extract.for.phi, %middle.block ]
+
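+; A C-like sketch of the pattern (a hypothetical reconstruction, not the
+; original source): the previous iteration's value escapes through two
+; separate uses, each of which needs its own LCSSA phi:
+;
+;   short cur = 0, prev;
+;   do { prev = cur; cur = cur - 1; } while (cur != 0);
+;   use(prev); use(prev);   /* two LCSSA phis in the exit block */
+;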
+define void @f1() {
+bb2.lr.ph:
+ br label %bb2
+
+bb2: ; preds = %bb2, %bb2.lr.ph
+ %_tmp132 = phi i16 [ 0, %bb2.lr.ph ], [ %_tmp10, %bb2 ]
+ %_tmp133 = phi i16 [ undef, %bb2.lr.ph ], [ %_tmp10, %bb2 ]
+ %_tmp10 = sub nsw i16 %_tmp132, 1
+ %_tmp15 = icmp ne i16 %_tmp10, 0
+ br i1 %_tmp15, label %bb2, label %bb1.bb3_crit_edge
+
+bb1.bb3_crit_edge: ; preds = %bb2
+ %_tmp133.lcssa1 = phi i16 [ %_tmp133, %bb2 ]
+ %_tmp133.lcssa = phi i16 [ %_tmp133, %bb2 ]
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll
new file mode 100644
index 00000000000..c9d22bec660
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll
@@ -0,0 +1,42 @@
+; RUN: opt -passes='loop-vectorize' -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Forcing VF=2 to trigger vector code gen
+;
+; This is a test case in which the vectorizer's code generation produces
+; more than one basic block in the loop body (an emulated masked scatter)
+; for targets that do not support masked scatter. Broadcast code generation
+; was previously dependent on the loop body being a single basic block, and
+; this test case exposed incorrect code generation that resulted in an
+; assert during IR verification. The test passes if IR verification does
+; not fail.
+;
+; We perform a minimal check on the output to ensure the loop is actually
+; vectorized.
+;
+; CHECK: vector.body
+
+@a = external global [2 x i16], align 1
+
+define void @f1() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %land.end, %entry
+ %0 = phi i32 [ undef, %entry ], [ %dec, %land.end ]
+ br i1 undef, label %land.end, label %land.rhs
+
+land.rhs: ; preds = %for.body
+ %1 = load i32, i32* undef, align 1
+ br label %land.end
+
+land.end: ; preds = %land.rhs, %for.body
+ %2 = trunc i32 %0 to i16
+ %arrayidx = getelementptr inbounds [2 x i16], [2 x i16]* @a, i16 0, i16 %2
+ store i16 undef, i16* %arrayidx, align 1
+ %dec = add nsw i32 %0, -1
+ %cmp = icmp sgt i32 %0, 1
+ br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %land.end
+ unreachable
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr37515.ll b/llvm/test/Transforms/LoopVectorize/pr37515.ll
new file mode 100644
index 00000000000..b09e11fe15e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr37515.ll
@@ -0,0 +1,20 @@
+; RUN: opt -passes='loop-vectorize' -S -pass-remarks-missed=loop-vectorize < %s 2>&1 | FileCheck %s
+;
+; A floating-point primary induction variable is not supported in LV. Make
+; sure Legal bails out.
+;
+; CHECK: loop not vectorized
+
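+; Roughly, in C (a sketch under the assumption that the only loop counter is
+; the float itself; the exact bounds differ slightly from the IR below):
+;
+;   for (float f = 18.0f; f > 2.0f; f -= 1.0f)
+;     use(f * f);
+;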
+define void @PR37515() {
+entry:
+ br label %loop
+
+loop:
+ %p = phi float [ 19.0, %entry ], [ %a, %loop ]
+ %a = fadd fast float %p, -1.0
+ %m = fmul fast float %a, %a
+ %c = fcmp fast ugt float %a, 2.0
+ br i1 %c, label %loop, label %exit
+
+exit:
+ unreachable
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr38800.ll b/llvm/test/Transforms/LoopVectorize/pr38800.ll
new file mode 100755
index 00000000000..d3e937b5b76
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr38800.ll
@@ -0,0 +1,34 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -pass-remarks-missed='loop-vectorize' -S < %s 2>&1 | FileCheck %s
+
+; CHECK: remark: <unknown>:0:0: loop not vectorized: integer loop induction variable could not be identified
+
+; Test-case ('-O2 -ffast-math') from PR38800.
+; (Set '-force-vector-width=2' to enable vector code generation.)
+;
+; The absence of an integer induction variable in the source code caused a
+; compiler crash when attempting to vectorize. With the fix, a remark
+; indicating why the loop wasn't vectorized is produced instead.
+;
+;void foo(float *ptr, float val) {
+; float f;
+; for (f = 0.1f; f < 1.0f; f += 0.01f)
+; *ptr += val;
+;}
+
+define void @foo(float* nocapture %ptr, float %val) local_unnamed_addr {
+entry:
+ %ptr.promoted = load float, float* %ptr, align 4
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %add5 = phi float [ %ptr.promoted, %entry ], [ %add, %for.body ]
+ %f.04 = phi float [ 0x3FB99999A0000000, %entry ], [ %add1, %for.body ]
+ %add = fadd fast float %add5, %val
+ %add1 = fadd fast float %f.04, 0x3F847AE140000000
+ %cmp = fcmp fast olt float %add1, 1.000000e+00
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ store float %add, float* %ptr, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr39099.ll b/llvm/test/Transforms/LoopVectorize/pr39099.ll
new file mode 100644
index 00000000000..2f75e23aaa6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr39099.ll
@@ -0,0 +1,42 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; Ensure that we don't create interleave groups for predicated
+; strided accesses.
+
+; CHECK: LV: Checking a loop in "masked_strided"
+; CHECK: LV: Analyzing interleaved accesses...
+; CHECK-NOT: LV: Creating an interleave group
+
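+; A C sketch of the loop (a reconstruction; names are illustrative only):
+;
+;   void masked_strided(const unsigned char *p, unsigned char *q,
+;                       unsigned char guard) {
+;     for (unsigned i = 0; i < 1024; ++i)
+;       if (i > guard) {          /* predicated ... */
+;         q[2 * i] = p[2 * i];    /* ... stride-2 accesses */
+;         q[2 * i + 1] = -p[2 * i];
+;       }
+;   }
+;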
+define dso_local void @masked_strided(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+ %conv = zext i8 %guard to i32
+ br label %for.body
+
+for.body:
+ %ix.017 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp1 = icmp ugt i32 %ix.017, %conv
+ br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+ %mul = shl nuw nsw i32 %ix.017, 1
+ %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+ %0 = load i8, i8* %arrayidx, align 1
+ %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul
+ store i8 %0, i8* %arrayidx4, align 1
+ %sub = sub i8 0, %0
+ %add = or i32 %mul, 1
+ %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add
+ store i8 %sub, i8* %arrayidx8, align 1
+ br label %for.inc
+
+for.inc:
+ %inc = add nuw nsw i32 %ix.017, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
new file mode 100644
index 00000000000..6032fb18a38
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; PR39417
+; Check that the need for an overflow check prevents vectorizing a loop with
+; a tiny trip count (which implies optimizing for size).
+; CHECK-LABEL: @func_34
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: bb67:
+define void @func_34() {
+bb1:
+ br label %bb67
+
+bb67:
+ %storemerge2 = phi i32 [ 0, %bb1 ], [ %_tmp2300, %bb67 ]
+ %sext = shl i32 %storemerge2, 16
+ %_tmp2299 = ashr exact i32 %sext, 16
+ %_tmp2300 = add nsw i32 %_tmp2299, 1
+ %_tmp2310 = trunc i32 %_tmp2300 to i16
+ %_tmp2312 = icmp slt i16 %_tmp2310, 3
+ br i1 %_tmp2312, label %bb67, label %bb68
+
+bb68:
+ ret void
+}
+
+; Check that the need for a stride==1 check prevents vectorizing a loop
+; under opt-for-size.
+; CHECK-LABEL: @scev4stride1
+; CHECK-NOT: vector.scevcheck
+; CHECK-NOT: vector.body:
+; CHECK-LABEL: for.body:
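+; A C sketch of the loop (reconstruction): the stride %k is only known at
+; runtime, so vectorizing would require a stride==1 check.
+;
+;   for (int i = 0; i < 1024; ++i)
+;     a[i] = b[i * k];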
+define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 {
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %mul = mul nsw i32 %i.07, %k
+ %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
+ store i32 %0, i32* %arrayidx1, align 4
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, 1024
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ ret void
+}
+
+attributes #0 = { optsize }
diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
new file mode 100644
index 00000000000..8880d99290d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -loop-vectorize -S 2>&1 | FileCheck %s
+; RUN: opt < %s -debugify -loop-vectorize -S | FileCheck %s -check-prefix DEBUGLOC
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; This test makes sure we don't duplicate the loop vectorizer's metadata
+; while marking the loop as already vectorized (by setting width = 1), even
+; at lower optimization levels, where no extra cleanup is done.
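+;
+; Concretely, the input loop metadata at the bottom of this file,
+;   !0 = !{!0, !1}
+;   !1 = !{!"llvm.loop.vectorize.width", i32 4}
+; must be replaced by (not appended to)
+;   !{!"llvm.loop.isvectorized", i32 1}
+; as the CHECK lines below verify.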
+
+; DEBUGLOC-LABEL: define void @_Z3fooPf(
+; Check that the phi to resume the scalar part of the loop
+; has Debug Location.
+define void @_Z3fooPf(float* %a) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %p = load float, float* %arrayidx, align 4
+ %mul = fmul float %p, 2.000000e+00
+ store float %mul, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+; DEBUGLOC: scalar.ph:
+; DEBUGLOC-NEXT: %bc.resume.val = phi {{.*}} !dbg ![[DbgLoc:[0-9]+]]
+;
+; DEBUGLOC: ![[DbgLoc]] = !DILocation(line: 2
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+; CHECK-NOT: !{metadata !"llvm.loop.vectorize.width", i32 4}
+; CHECK: !{!"llvm.loop.isvectorized", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/ptr-induction.ll b/llvm/test/Transforms/LoopVectorize/ptr-induction.ll
new file mode 100644
index 00000000000..47d33352763
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ptr-induction.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; This testcase causes SCEV to return a pointer-typed exit value.
+
+; CHECK: @f
+; Expect that the pointer indvar has been converted into an integer indvar.
+; CHECK: %index.next = add i64 %index, 4
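+; A C sketch of the loop (reconstruction):
+;
+;   int f(const int *a, const int *b) {
+;     int acc = 0;
+;     if (a < b)
+;       do { acc += *++a; } while (a != b);  /* pointer indvar */
+;     return acc;
+;   }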
+define i32 @f(i32* readonly %a, i32* readnone %b) #0 {
+entry:
+ %cmp.6 = icmp ult i32* %a, %b
+ br i1 %cmp.6, label %while.body.preheader, label %while.end
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %a.pn = phi i32* [ %incdec.ptr8, %while.body ], [ %a, %while.body.preheader ]
+ %acc.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+ %incdec.ptr8 = getelementptr inbounds i32, i32* %a.pn, i64 1
+ %0 = load i32, i32* %incdec.ptr8, align 1
+ %add = add nuw nsw i32 %0, %acc.07
+ %exitcond = icmp eq i32* %incdec.ptr8, %b
+ br i1 %exitcond, label %while.cond.while.end_crit_edge, label %while.body
+
+while.cond.while.end_crit_edge: ; preds = %while.body
+ %add.lcssa = phi i32 [ %add, %while.body ]
+ br label %while.end
+
+while.end: ; preds = %while.cond.while.end_crit_edge, %entry
+ %acc.0.lcssa = phi i32 [ %add.lcssa, %while.cond.while.end_crit_edge ], [ 0, %entry ]
+ ret i32 %acc.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ptr_loops.ll b/llvm/test/Transforms/LoopVectorize/ptr_loops.ll
new file mode 100644
index 00000000000..c374b006147
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ptr_loops.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@A = global [36 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35], align 16
+@B = global [36 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35], align 16
+
+;CHECK-LABEL:@_Z5test1v(
+;CHECK: load <4 x i32>
+;CHECK: shufflevector <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define i32 @_Z5test1v() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %0, %1
+ %p.02 = phi i32* [ getelementptr inbounds ([36 x i32], [36 x i32]* @A, i64 0, i64 18), %0 ], [ %4, %1 ]
+ %b.01 = phi i32* [ getelementptr inbounds ([36 x i32], [36 x i32]* @B, i64 0, i64 0), %0 ], [ %5, %1 ]
+ %2 = load i32, i32* %b.01, align 4
+ %3 = shl nsw i32 %2, 1
+ store i32 %3, i32* %p.02, align 4
+ %4 = getelementptr inbounds i32, i32* %p.02, i64 -1
+ %5 = getelementptr inbounds i32, i32* %b.01, i64 1
+ %6 = icmp eq i32* %4, getelementptr ([36 x i32], [36 x i32]* @A, i64 128102389400760775, i64 3)
+ br i1 %6, label %7, label %1
+
+; <label>:7 ; preds = %1
+ ret i32 0
+}
+
+;CHECK-LABEL: @_Z5test2v(
+;CHECK: load <4 x i32>
+;CHECK: shufflevector <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define i32 @_Z5test2v() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %0, %1
+ %p.02 = phi i32* [ getelementptr inbounds ([36 x i32], [36 x i32]* @A, i64 0, i64 25), %0 ], [ %3, %1 ]
+ %b.01 = phi i32* [ getelementptr inbounds ([36 x i32], [36 x i32]* @B, i64 0, i64 2), %0 ], [ %4, %1 ]
+ %2 = load i32, i32* %b.01, align 4
+ store i32 %2, i32* %p.02, align 4
+ %3 = getelementptr inbounds i32, i32* %p.02, i64 -1
+ %4 = getelementptr inbounds i32, i32* %b.01, i64 1
+ %5 = icmp eq i32* %4, getelementptr inbounds ([36 x i32], [36 x i32]* @A, i64 0, i64 18)
+ br i1 %5, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret i32 0
+}
+
+;CHECK:_Z5test3v
+;CHECK: load <4 x i32>
+;CHECK: shufflevector <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+define i32 @_Z5test3v() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %0, %1
+ %p.02 = phi i32* [ getelementptr inbounds ([36 x i32], [36 x i32]* @A, i64 0, i64 29), %0 ], [ %3, %1 ]
+ %b.01 = phi i32* [ getelementptr inbounds ([36 x i32], [36 x i32]* @B, i64 0, i64 5), %0 ], [ %4, %1 ]
+ %2 = load i32, i32* %b.01, align 4
+ store i32 %2, i32* %p.02, align 4
+ %3 = getelementptr inbounds i32, i32* %p.02, i64 -1
+ %4 = getelementptr inbounds i32, i32* %b.01, i64 1
+ %5 = icmp eq i32* %3, getelementptr ([36 x i32], [36 x i32]* @A, i64 128102389400760775, i64 3)
+ br i1 %5, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/read-only.ll b/llvm/test/Transforms/LoopVectorize/read-only.ll
new file mode 100644
index 00000000000..921e3fcdae1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/read-only.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @read_only_func(
+;CHECK: load <4 x i32>
+;CHECK: ret i32
+define i32 @read_only_func(i32* nocapture %A, i32* nocapture %B, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i64 %indvars.iv, 13
+ %5 = getelementptr inbounds i32, i32* %B, i64 %4
+ %6 = load i32, i32* %5, align 4
+ %7 = shl i32 %6, 1
+ %8 = add i32 %3, %sum.02
+ %9 = add i32 %8, %7
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
new file mode 100644
index 00000000000..879f1c3c5ad
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -force-vector-width=4 -force-vector-interleave=1 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @PR34687(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP17:%.*]], %[[LATCH]] ]
+; CHECK: [[LATCH]]:
+; CHECK: [[TMP13:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw <4 x i32> [[TMP13]], {{.*}}
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK: [[TMP16:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i8>
+; CHECK-NEXT: [[TMP17]] = zext <4 x i8> [[TMP16]] to <4 x i32>
+; CHECK-NEXT: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i8 @PR34687(i1 %c, i32 %x, i32 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %entry ], [ %i.next, %if.end ]
+ %r = phi i32 [ 0, %entry ], [ %r.next, %if.end ]
+ br i1 %c, label %if.then, label %if.end
+
+if.then:
+ %tmp0 = sdiv i32 undef, undef
+ br label %if.end
+
+if.end:
+ %tmp1 = and i32 %r, 255
+ %i.next = add nsw i32 %i, 1
+ %r.next = add nuw nsw i32 %tmp1, %x
+ %cond = icmp eq i32 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ %tmp2 = phi i32 [ %r.next, %if.end ]
+ %tmp3 = trunc i32 %tmp2 to i8
+ ret i8 %tmp3
+}
+
+; CHECK-LABEL: @PR35734(
+; CHECK: vector.ph:
+; CHECK: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 %y, i32 0
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP3]], %vector.ph ], [ [[TMP9:%.*]], %vector.body ]
+; CHECK: [[TMP5:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i1>
+; CHECK-NEXT: [[TMP9]] = sext <4 x i1> [[TMP8]] to <4 x i32>
+; CHECK-NEXT: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @PR35734(i32 %x, i32 %y) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i32 [ %x, %entry ], [ %i.next, %for.body ]
+ %r = phi i32 [ %y, %entry ], [ %r.next, %for.body ]
+ %tmp0 = and i32 %r, 1
+ %r.next = add i32 %tmp0, -1
+ %i.next = add nsw i32 %i, 1
+ %cond = icmp sgt i32 %i, 77
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ %tmp1 = phi i32 [ %r.next, %for.body ]
+ ret i32 %tmp1
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll
new file mode 100644
index 00000000000..1246cd5ca82
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reduction.ll
@@ -0,0 +1,580 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @reduction_sum(
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: ret i32
+define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = trunc i64 %indvars.iv to i32
+ %7 = add i32 %sum.02, %6
+ %8 = add i32 %7, %3
+ %9 = add i32 %8, %5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
+
+;CHECK-LABEL: @reduction_prod(
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: mul <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: mul <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: ret i32
+define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = trunc i64 %indvars.iv to i32
+ %7 = mul i32 %prod.02, %6
+ %8 = mul i32 %7, %3
+ %9 = mul i32 %8, %5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
+ ret i32 %prod.0.lcssa
+}
+
+;CHECK-LABEL: @reduction_mix(
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: mul nsw <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: ret i32
+define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = mul nsw i32 %5, %3
+ %7 = trunc i64 %indvars.iv to i32
+ %8 = add i32 %sum.02, %7
+ %9 = add i32 %8, %6
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
+
+;CHECK-LABEL: @reduction_mul(
+;CHECK: mul <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: mul <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: mul <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: ret i32
+define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = trunc i64 %indvars.iv to i32
+ %7 = add i32 %3, %6
+ %8 = add i32 %7, %5
+ %9 = mul i32 %8, %sum.02
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
+
+;CHECK-LABEL: @start_at_non_zero(
+;CHECK: phi <4 x i32>
+;CHECK: <i32 120, i32 0, i32 0, i32 0>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: ret i32
+define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %in, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %coeff, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %sum.09
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
+ ret i32 %sum.0.lcssa
+}
+
+;CHECK-LABEL: @reduction_and(
+;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
+;CHECK: and <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: and <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: and <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: ret i32
+define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %and = and i32 %add, %result.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
+ ret i32 %result.0.lcssa
+}
+
+;CHECK-LABEL: @reduction_or(
+;CHECK: or <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: or <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: or <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: ret i32
+define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %or = or i32 %add, %result.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
+ ret i32 %result.0.lcssa
+}
+
+;CHECK-LABEL: @reduction_xor(
+;CHECK: xor <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: xor <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: xor <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: ret i32
+define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %xor = xor i32 %add, %result.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
+ ret i32 %result.0.lcssa
+}
+
+; In this code the reduction variable is on the RHS of the subtraction, so
+; this is not a reduction we can vectorize.
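+; A C sketch of the pattern (reconstruction):
+;   x = A[i] - x;   /* accumulator on the RHS */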
+;CHECK-LABEL: @reduction_sub_rhs(
+;CHECK-NOT: phi <4 x i32>
+;CHECK-NOT: sub nsw <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_sub_rhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %sub = sub nsw i32 %0, %x.05
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+ ret i32 %x.0.lcssa
+}
+
+
+; In this test the reduction variable is on the LHS and we can vectorize it.
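+; A C sketch of the pattern (reconstruction):
+;   x = x - A[i];   /* accumulator on the LHS */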
+;CHECK-LABEL: @reduction_sub_lhs(
+;CHECK: phi <4 x i32>
+;CHECK: sub nsw <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %x.05 = phi i32 [ %sub, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %sub = sub nsw i32 %x.05, %0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %x.0.lcssa = phi i32 [ 0, %entry ], [ %sub, %for.body ]
+ ret i32 %x.0.lcssa
+}
+
+; We can vectorize conditional reductions with multi-input phis.
+; CHECK: reduction_conditional
+; CHECK: fadd fast <4 x float>
+
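+; A C sketch of the loop (reconstruction; fast-math assumed as in the IR):
+;
+;   float sum = S;
+;   for (int i = 0; i < 128; ++i) {
+;     if (A[i] > B[i]) {
+;       if (B[i] > 1.0f)
+;         sum += A[i];
+;       else if (A[i] > 2.0f)
+;         sum += B[i];
+;     }
+;   }
+;   return sum;
+;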
+define float @reduction_conditional(float* %A, float* %B, float* %C, float %S) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %cmp3 = fcmp ogt float %0, %1
+ br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:
+ %cmp6 = fcmp ogt float %1, 1.000000e+00
+ br i1 %cmp6, label %if.then8, label %if.else
+
+if.then8:
+ %add = fadd fast float %sum.033, %0
+ br label %for.inc
+
+if.else:
+ %cmp14 = fcmp ogt float %0, 2.000000e+00
+ br i1 %cmp14, label %if.then16, label %for.inc
+
+if.then16:
+ %add19 = fadd fast float %sum.033, %1
+ br label %for.inc
+
+for.inc:
+ %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ %sum.033, %if.else ], [ %sum.033, %for.body ]
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
+ ret float %sum.1.lcssa
+}
+
+; We can't vectorize reductions with phi inputs from outside the reduction.
+; CHECK: noreduction_phi
+; CHECK-NOT: fadd <4 x float>
+define float @noreduction_phi(float* %A, float* %B, float* %C, float %S) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %sum.033 = phi float [ %S, %entry ], [ %sum.1, %for.inc ]
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %1 = load float, float* %arrayidx2, align 4
+ %cmp3 = fcmp ogt float %0, %1
+ br i1 %cmp3, label %if.then, label %for.inc
+
+if.then:
+ %cmp6 = fcmp ogt float %1, 1.000000e+00
+ br i1 %cmp6, label %if.then8, label %if.else
+
+if.then8:
+ %add = fadd fast float %sum.033, %0
+ br label %for.inc
+
+if.else:
+ %cmp14 = fcmp ogt float %0, 2.000000e+00
+ br i1 %cmp14, label %if.then16, label %for.inc
+
+if.then16:
+ %add19 = fadd fast float %sum.033, %1
+ br label %for.inc
+
+for.inc:
+ %sum.1 = phi float [ %add, %if.then8 ], [ %add19, %if.then16 ], [ 0.000000e+00, %if.else ], [ %sum.033, %for.body ]
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ %sum.1.lcssa = phi float [ %sum.1, %for.inc ]
+ ret float %sum.1.lcssa
+}
+
+; We can't vectorize reductions that feed another header PHI.
+; CHECK: noredux_header_phi
+; CHECK-NOT: fadd <4 x float>
+
+define float @noredux_header_phi(float* %A, float* %B, float* %C, float %S) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %sum2.09 = phi float [ 0.000000e+00, %entry ], [ %add1, %for.body ]
+ %sum.08 = phi float [ %S, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %add = fadd fast float %sum.08, %0
+ %add1 = fadd fast float %sum2.09, %add
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 128
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end:
+ %add1.lcssa = phi float [ %add1, %for.body ]
+ %add.lcssa = phi float [ %add, %for.body ]
+ %add2 = fadd fast float %add.lcssa, %add1.lcssa
+ ret float %add2
+}
+
+
+; When vectorizing a reduction whose loop-header phi value is used outside
+; the loop, special care must be taken. Otherwise, the reduced value feeding
+; the outside user misses (VF-1) iterations of the loop.
+; PR16522
+
+; CHECK-LABEL: @phivalueredux(
+; CHECK-NOT: x i32>
+
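+; A C sketch of the pattern (reconstruction): the value returned is the
+; header phi, i.e. the accumulator as of the *start* of the last iteration:
+;
+;   int p2 = p, t;
+;   for (int i = 0; i < 16; ++i) { t = p2; p2 = ~p2; }
+;   return t;
+;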
+define i32 @phivalueredux(i32 %p) {
+entry:
+ br label %for.body
+
+for.body:
+ %t.03 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %p.addr.02 = phi i32 [ %p, %entry ], [ %xor, %for.body ]
+ %xor = xor i32 %p.addr.02, -1
+ %inc = add nsw i32 %t.03, 1
+ %exitcond = icmp eq i32 %inc, 16
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 %p.addr.02
+}
+
+; Don't vectorize a reduction value that is not the last in the reduction
+; cycle. We would lose (VF-1) iterations on the operations after that use.
+; PR17498
+
+; CHECK-LABEL: not_last_operation
+; CHECK-NOT: x i32>
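+; A C sketch of the pattern (reconstruction; 'c' stands for the
+; loop-invariant xor result in the IR):
+;
+;   int x = val, t;
+;   for (int i = 0; i < 22; ++i) {
+;     t = x + c;   /* escaping use of a mid-cycle value */
+;     x = t + 1;
+;   }
+;   return t + 2;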
+define i32 @not_last_operation(i32 %p, i32 %val) {
+entry:
+ %tobool = icmp eq i32 %p, 0
+ br label %for.body
+
+for.body:
+ %inc613.1 = phi i32 [ 0, %entry ], [ %inc6.1, %for.body ]
+ %inc511.1 = phi i32 [ %val, %entry ], [ %inc5.1, %for.body ]
+ %0 = zext i1 %tobool to i32
+ %inc4.1 = xor i32 %0, 1
+ %inc511.1.inc4.1 = add nsw i32 %inc511.1, %inc4.1
+ %inc5.1 = add nsw i32 %inc511.1.inc4.1, 1
+ %inc6.1 = add nsw i32 %inc613.1, 1
+ %exitcond.1 = icmp eq i32 %inc6.1, 22
+ br i1 %exitcond.1, label %exit, label %for.body
+
+exit:
+ %inc.2 = add nsw i32 %inc511.1.inc4.1, 2
+ ret i32 %inc.2
+}
+
+;CHECK-LABEL: @reduction_sum_multiuse(
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+;CHECK: add <4 x i32>
+;CHECK: extractelement <4 x i32> %{{.*}}, i32 0
+;CHECK: %sum.lcssa = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ]
+;CHECK: %sum.copy = phi i32 [ %[[SCALAR]], %.lr.ph ], [ %[[VECTOR]], %middle.block ]
+;CHECK: ret i32
+define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph.preheader, label %end
+.lr.ph.preheader: ; preds = %0
+ br label %.lr.ph
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = trunc i64 %indvars.iv to i32
+ %7 = add i32 %sum.02, %6
+ %8 = add i32 %7, %3
+ %9 = add i32 %8, %5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.lcssa = phi i32 [ %9, %.lr.ph ]
+ %sum.copy = phi i32 [ %9, %.lr.ph ]
+ br label %end
+
+end:
+ %f1 = phi i32 [ 0, %0 ], [ %sum.lcssa, %._crit_edge ]
+ %f2 = phi i32 [ 0, %0 ], [ %sum.copy, %._crit_edge ]
+ %final = add i32 %f1, %f2
+ ret i32 %final
+}
+
+; This looks like a predicated reduction, but it is a reset of the reduction
+; variable. We cannot vectorize this.
+; CHECK-LABEL: reduction_reset(
+; CHECK-NOT: <4 x i32>
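+; A C sketch of the loop (reconstruction):
+;
+;   int x = 100;
+;   for (int i = 0; i < N; ++i)
+;     x = (A[i] > 0) ? x + A[i] : 0;   /* the 0 arm resets the accumulator */
+;   B[N - 1] = x;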
+define void @reduction_reset(i32 %N, i32* nocapture readonly %arrayA, i32* nocapture %arrayB) {
+entry:
+ %c4 = icmp sgt i32 %N, 0
+ br i1 %c4, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader: ; preds = %entry
+ %c5 = add i32 %N, -1
+ %wide.trip.count = zext i32 %N to i64
+ br label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
+ %indvars.iv = phi i64 [ 0, %.lr.ph.preheader ], [ %indvars.iv.next, %.lr.ph ]
+ %.017 = phi i32 [ 100, %.lr.ph.preheader ], [ %csel, %.lr.ph ]
+ %c6 = getelementptr inbounds i32, i32* %arrayA, i64 %indvars.iv
+ %c7 = load i32, i32* %c6, align 4
+ %c8 = icmp sgt i32 %c7, 0
+ %c9 = add nsw i32 %c7, %.017
+ %csel = select i1 %c8, i32 %c9, i32 0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+
+._crit_edge.loopexit: ; preds = %.lr.ph
+ %csel.lcssa = phi i32 [ %csel, %.lr.ph ]
+ %phitmp19 = sext i32 %c5 to i64
+ br label %._crit_edge
+
+._crit_edge: ; preds = %._crit_edge.loopexit, %entry
+ %.015.lcssa = phi i64 [ -1, %entry ], [ %phitmp19, %._crit_edge.loopexit ]
+ %.0.lcssa = phi i32 [ 100, %entry ], [ %csel.lcssa, %._crit_edge.loopexit ]
+ %c10 = getelementptr inbounds i32, i32* %arrayB, i64 %.015.lcssa
+ store i32 %.0.lcssa, i32* %c10, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/remove_metadata.ll b/llvm/test/Transforms/LoopVectorize/remove_metadata.ll
new file mode 100644
index 00000000000..793c134dd7d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/remove_metadata.ll
@@ -0,0 +1,32 @@
+; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+;
+; Check that llvm.loop.vectorize.* metadata is removed after vectorization.
+;
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @disable_nonforced_enable(
+; CHECK: store <2 x i32>
+define void @disable_nonforced_enable(i32* nocapture %a, i32 %n) {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = trunc i64 %indvars.iv to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret void
+}
+
+!0 = !{!0, !{!"llvm.loop.vectorize.some_property"}, !{!"llvm.loop.vectorize.enable", i32 1}}
+
+; CHECK-NOT: llvm.loop.vectorize.
+; CHECK: {!"llvm.loop.isvectorized", i32 1}
+; CHECK-NOT: llvm.loop.vectorize.
diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
new file mode 100644
index 00000000000..ce81e1f83fd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -0,0 +1,152 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure consecutive vector index generation produces correct negative
+; indices for reverse iteration.
+; PR15882
+
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i64 %startval, %index
+; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
+; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
+
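+; A C sketch of the loop (reconstruction):
+;
+;   int reverse_induction_i64(long startval, int *ptr) {
+;     int redux = 0;
+;     for (int i = 0; i < 1024; ++i)
+;       redux += ptr[--startval];
+;     return redux;
+;   }
+;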
+define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
+entry:
+ br label %for.body
+
+for.body:
+ %add.i7 = phi i64 [ %startval, %entry ], [ %add.i, %for.body ]
+ %i.06 = phi i32 [ 0, %entry ], [ %inc4, %for.body ]
+ %redux5 = phi i32 [ 0, %entry ], [ %inc.redux, %for.body ]
+ %add.i = add i64 %add.i7, -1
+ %kind_.i = getelementptr inbounds i32, i32* %ptr, i64 %add.i
+ %tmp.i1 = load i32, i32* %kind_.i, align 4
+ %inc.redux = add i32 %tmp.i1, %redux5
+ %inc4 = add i32 %i.06, 1
+ %exitcond = icmp ne i32 %inc4, 1024
+ br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+ ret i32 %inc.redux
+}
+
+; CHECK-LABEL: @reverse_induction_i128(
+; CHECK: %index = phi i128 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i128 %startval, %index
+; CHECK: %[[a0:.+]] = add i128 %offset.idx, 0
+; CHECK: %[[a4:.+]] = add i128 %offset.idx, -4
+
+define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
+entry:
+ br label %for.body
+
+for.body:
+ %add.i7 = phi i128 [ %startval, %entry ], [ %add.i, %for.body ]
+ %i.06 = phi i32 [ 0, %entry ], [ %inc4, %for.body ]
+ %redux5 = phi i32 [ 0, %entry ], [ %inc.redux, %for.body ]
+ %add.i = add i128 %add.i7, -1
+ %kind_.i = getelementptr inbounds i32, i32* %ptr, i128 %add.i
+ %tmp.i1 = load i32, i32* %kind_.i, align 4
+ %inc.redux = add i32 %tmp.i1, %redux5
+ %inc4 = add i32 %i.06, 1
+ %exitcond = icmp ne i32 %inc4, 1024
+ br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+ ret i32 %inc.redux
+}
+
+; CHECK-LABEL: @reverse_induction_i16(
+; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i16 %startval, {{.*}}
+; CHECK: %[[a0:.+]] = add i16 %offset.idx, 0
+; CHECK: %[[a4:.+]] = add i16 %offset.idx, -4
+
+define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
+entry:
+ br label %for.body
+
+for.body:
+ %add.i7 = phi i16 [ %startval, %entry ], [ %add.i, %for.body ]
+ %i.06 = phi i32 [ 0, %entry ], [ %inc4, %for.body ]
+ %redux5 = phi i32 [ 0, %entry ], [ %inc.redux, %for.body ]
+ %add.i = add i16 %add.i7, -1
+ %kind_.i = getelementptr inbounds i32, i32* %ptr, i16 %add.i
+ %tmp.i1 = load i32, i32* %kind_.i, align 4
+ %inc.redux = add i32 %tmp.i1, %redux5
+ %inc4 = add i32 %i.06, 1
+ %exitcond = icmp ne i32 %inc4, 1024
+ br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+ ret i32 %inc.redux
+}
+
+
+@a = common global [1024 x i32] zeroinitializer, align 16
+
+; We used to incorrectly transform this loop into an empty one because we
+; left the induction variable as an i8 and truncated the exit value 1024 to 0.
+; int a[1024];
+;
+; void fail() {
+; int reverse_induction = 1023;
+; unsigned char forward_induction = 0;
+; while ((reverse_induction) >= 0) {
+; forward_induction++;
+; a[reverse_induction] = forward_induction;
+; --reverse_induction;
+; }
+; }
+
+; CHECK-LABEL: @reverse_forward_induction_i64_i8(
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i64 1023, %index
+; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
+; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
+
+define void @reverse_forward_induction_i64_i8() {
+entry:
+ br label %while.body
+
+while.body:
+ %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %while.body ]
+ %forward_induction.05 = phi i8 [ 0, %entry ], [ %inc, %while.body ]
+ %inc = add i8 %forward_induction.05, 1
+ %conv = zext i8 %inc to i32
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, -1
+ %0 = trunc i64 %indvars.iv to i32
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+ ret void
+}
+
+; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed(
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = sub i64 1023, %index
+; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0
+; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4
+
+define void @reverse_forward_induction_i64_i8_signed() {
+entry:
+ br label %while.body
+
+while.body:
+ %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %while.body ]
+ %forward_induction.05 = phi i8 [ -127, %entry ], [ %inc, %while.body ]
+ %inc = add i8 %forward_induction.05, 1
+ %conv = sext i8 %inc to i32
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, -1
+ %0 = trunc i64 %indvars.iv to i32
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/reverse_iter.ll b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll
new file mode 100644
index 00000000000..bd057698280
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reverse_iter.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; PR15882: This test ensures that we do not produce wrapping arithmetic when
+; creating constant reverse step vectors.
+;
+; int foo(int n, int *A) {
+; int sum;
+; for (int i=n; i > 0; i--)
+; sum += A[i*2];
+; return sum;
+; }
+;
+
+;CHECK-LABEL: @foo(
+;CHECK: <i32 0, i32 -1, i32 -2, i32 -3>
+;CHECK: ret
+define i32 @foo(i32 %n, i32* nocapture %A) {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = sext i32 %n to i64
+ br label %3
+
+; <label>:3 ; preds = %.lr.ph, %3
+ %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ]
+ %sum.01 = phi i32 [ undef, %.lr.ph ], [ %9, %3 ]
+ %4 = trunc i64 %indvars.iv to i32
+ %5 = shl nsw i32 %4, 1
+ %6 = sext i32 %5 to i64
+ %7 = getelementptr inbounds i32, i32* %A, i64 %6
+ %8 = load i32, i32* %7, align 4
+ %9 = add nsw i32 %8, %sum.01
+ %indvars.iv.next = add i64 %indvars.iv, -1
+ %10 = trunc i64 %indvars.iv.next to i32
+ %11 = icmp sgt i32 %10, 0
+ br i1 %11, label %3, label %._crit_edge
+
+._crit_edge: ; preds = %3, %0
+ %sum.0.lcssa = phi i32 [ undef, %0 ], [ %9, %3 ]
+ ret i32 %sum.0.lcssa
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
new file mode 100644
index 00000000000..8e7ac1f865a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-address-space.ll
@@ -0,0 +1,221 @@
+; RUN: opt -S -march=r600 -mcpu=cayman -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+
+; Check vectorization that would ordinarily require a runtime bounds
+; check on the pointers when mixing address spaces. For now we cannot
+; assume address spaces do not alias, and we can't assume that
+; different pointers are directly comparable.
+;
+; These tests all exercise the same basic loop with different combinations
+; of address spaces, swapping in globals or adding noalias.
+;
+;void foo(int addrspace(N)* [noalias] a, int addrspace(M)* [noalias] b, int n)
+;{
+; for (int i = 0; i < n; ++i)
+; {
+; a[i] = 3 * b[i];
+; }
+;}
+
+; Artificial datalayout
+target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+
+
+@g_as1 = common addrspace(1) global [1024 x i32] zeroinitializer, align 16
+@q_as2 = common addrspace(2) global [1024 x i32] zeroinitializer, align 16
+
+; Both parameters are unidentified objects with the same address
+; space, so this should vectorize normally.
+define void @foo(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %n) #0 {
+; CHECK-LABEL: @foo(
+; CHECK: <4 x i32>
+; CHECK: ret
+
+entry:
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = sext i32 %i.02 to i64
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %mul = mul nsw i32 %0, 3
+ %idxprom1 = sext i32 %i.02 to i64
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom1
+ store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Parameters are unidentified and different address spaces, so cannot vectorize.
+define void @bar0(i32* %a, i32 addrspace(1)* %b, i32 %n) #0 {
+; CHECK-LABEL: @bar0(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = sext i32 %i.02 to i64
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %mul = mul nsw i32 %0, 3
+ %idxprom1 = sext i32 %i.02 to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Swapped arguments should behave the same.
+define void @bar1(i32 addrspace(1)* %a, i32* %b, i32 %n) #0 {
+; CHECK-LABEL: @bar1(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = sext i32 %i.02 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul = mul nsw i32 %0, 3
+ %idxprom1 = sext i32 %i.02 to i64
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom1
+ store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; We should still be able to vectorize with noalias even if the
+; address spaces are different.
+define void @bar2(i32* noalias %a, i32 addrspace(1)* noalias %b, i32 %n) #0 {
+; CHECK-LABEL: @bar2(
+; CHECK: <4 x i32>
+; CHECK: ret
+
+entry:
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = sext i32 %i.02 to i64
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %mul = mul nsw i32 %0, 3
+ %idxprom1 = sext i32 %i.02 to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Store to identified global with different address space. This isn't
+; generally safe and shouldn't be vectorized.
+define void @arst0(i32* %b, i32 %n) #0 {
+; CHECK-LABEL: @arst0(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = sext i32 %i.02 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul = mul nsw i32 %0, 3
+ %idxprom1 = sext i32 %i.02 to i64
+ %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom1
+ store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+; Load from an identified global in a different address space.
+; This isn't generally safe and shouldn't be vectorized.
+define void @arst1(i32* %b, i32 %n) #0 {
+; CHECK-LABEL: @arst1(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = sext i32 %i.02 to i64
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %mul = mul nsw i32 %0, 3
+ %idxprom1 = sext i32 %i.02 to i64
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %idxprom1
+ store i32 %mul, i32* %arrayidx2, align 4
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Read from and write to two identified globals in different address
+; spaces. This should be vectorized.
+define void @aoeu(i32 %n) #0 {
+; CHECK-LABEL: @aoeu(
+; CHECK: <4 x i32>
+; CHECK: ret
+
+entry:
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %idxprom = sext i32 %i.02 to i64
+ %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(2)* @q_as2, i64 0, i64 %idxprom
+ %0 = load i32, i32 addrspace(2)* %arrayidx, align 4
+ %mul = mul nsw i32 %0, 3
+ %idxprom1 = sext i32 %i.02 to i64
+ %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i64 0, i64 %idxprom1
+ store i32 %mul, i32 addrspace(1)* %arrayidx2, align 4
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
new file mode 100644
index 00000000000..6ee983d22c6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly-address-space.ll
@@ -0,0 +1,132 @@
+; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+
+; Artificial datalayout
+target datalayout = "e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+
+
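+; Every loop below has the same C-level shape (hypothetical source; only the
+; address spaces of the three pointers vary between the functions):
+;
+;   void add_ints(int *a, int *b, int *c) {
+;     for (long i = 0; i < 200; i++)
+;       a[i] = b[i] + c[i];
+;   }
+;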
+define void @add_ints_1_1_1(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #0 {
+; CHECK-LABEL: @add_ints_1_1_1(
+; CHECK: <4 x i32>
+; CHECK: ret
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %i.01
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %i.01
+ %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+ %add = add nsw i32 %0, %1
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %i.01
+ store i32 %add, i32 addrspace(1)* %arrayidx2, align 4
+ %inc = add i64 %i.01, 1
+ %cmp = icmp ult i64 %inc, 200
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @add_ints_as_1_0_0(i32 addrspace(1)* %a, i32* %b, i32* %c) #0 {
+; CHECK-LABEL: @add_ints_as_1_0_0(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.01
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %c, i64 %i.01
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %0, %1
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %i.01
+ store i32 %add, i32 addrspace(1)* %arrayidx2, align 4
+ %inc = add i64 %i.01, 1
+ %cmp = icmp ult i64 %inc, 200
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @add_ints_as_0_1_0(i32* %a, i32 addrspace(1)* %b, i32* %c) #0 {
+; CHECK-LABEL: @add_ints_as_0_1_0(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %i.01
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %c, i64 %i.01
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %0, %1
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %i.01
+ store i32 %add, i32* %arrayidx2, align 4
+ %inc = add i64 %i.01, 1
+ %cmp = icmp ult i64 %inc, 200
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @add_ints_as_0_1_1(i32* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #0 {
+; CHECK-LABEL: @add_ints_as_0_1_1(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %i.01
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %c, i64 %i.01
+ %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+ %add = add nsw i32 %0, %1
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %i.01
+ store i32 %add, i32* %arrayidx2, align 4
+ %inc = add i64 %i.01, 1
+ %cmp = icmp ult i64 %inc, 200
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @add_ints_as_0_1_2(i32* %a, i32 addrspace(1)* %b, i32 addrspace(2)* %c) #0 {
+; CHECK-LABEL: @add_ints_as_0_1_2(
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.01 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %i.01
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(2)* %c, i64 %i.01
+ %1 = load i32, i32 addrspace(2)* %arrayidx1, align 4
+ %add = add nsw i32 %0, %1
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %i.01
+ store i32 %add, i32* %arrayidx2, align 4
+ %inc = add i64 %i.01, 1
+ %cmp = icmp ult i64 %inc, 200
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
new file mode 100644
index 00000000000..b37d94c0c32
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-readonly.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
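+; A C-level sketch of the loop below (hypothetical source). The three pointers
+; may alias, so vectorizing requires the runtime overlap checks matched below:
+;
+;   void add_ints(int *A, int *B, int *C) {
+;     for (long i = 0; i < 200; i++)
+;       A[i] = B[i] + C[i];
+;   }
+;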
+;CHECK-LABEL: @add_ints(
+;CHECK: br
+;CHECK: getelementptr
+;CHECK-DAG: getelementptr
+;CHECK-DAG: icmp ugt
+;CHECK-DAG: icmp ugt
+;CHECK-DAG: icmp ugt
+;CHECK-DAG: icmp ugt
+;CHECK-DAG: and
+;CHECK-DAG: and
+;CHECK: br
+;CHECK: ret
+define void @add_ints(i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx4, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 200
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
new file mode 100644
index 00000000000..2a665e56ab0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we vectorize this loop:
+; int foo(float *a, float *b, int n) {
+; for (int i=0; i<n; ++i)
+; a[i] = b[i] * 3;
+; }
+
+define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg !4
+; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]], !dbg !4
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1, !dbg !9
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg !9
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1, !dbg !9
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4, !dbg !9
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]], !dbg !9
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[N]], -1, !dbg !9
+; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64, !dbg !9
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1, !dbg !9
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP5]], !dbg !9
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 [[TMP5]], !dbg !9
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[A]], !dbg !9
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]], !dbg !9
+; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]], !dbg !9
+; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], !dbg !9
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588, !dbg !9
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], !dbg !9
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg !9
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]], !dbg !9
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*, !dbg !9
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP8]], align 4, !dbg !9, !alias.scope !10
+; CHECK-NEXT: [[TMP9:%.*]] = fmul <4 x float> [[WIDE_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, !dbg !9
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]], !dbg !9
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*, !dbg !9
+; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP11]], align 4, !dbg !9, !alias.scope !13, !noalias !10
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4, !dbg !9
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg !9
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg !9, !llvm.loop !15
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg !9
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg !9
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], !dbg !9
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]], !dbg !9
+; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4, !dbg !9
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP13]], 3.000000e+00, !dbg !9
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]], !dbg !9
+; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX2]], align 4, !dbg !9
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1, !dbg !9
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg !9
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg !9
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !dbg !9, !llvm.loop !17
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]], !dbg !18
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 undef, !dbg !18
+;
+entry:
+ %cmp6 = icmp sgt i32 %n, 0, !dbg !6
+ br i1 %cmp6, label %for.body, label %for.end, !dbg !6
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ], !dbg !7
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv, !dbg !7
+ %0 = load float, float* %arrayidx, align 4, !dbg !7
+ %mul = fmul float %0, 3.000000e+00, !dbg !7
+ %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv, !dbg !7
+ store float %mul, float* %arrayidx2, align 4, !dbg !7
+ %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !7
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !7
+ %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !7
+ br i1 %exitcond, label %for.end, label %for.body, !dbg !7
+
+for.end: ; preds = %for.body, %entry
+ ret i32 undef, !dbg !8
+}
+
+; Make sure that we still try to vectorize loops with runtime checks when the
+; static dependence check fails.
+
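+; A C-level sketch (hypothetical source). Since offset and offset2 are unknown
+; at compile time, the two a[] accesses may overlap, and only a runtime check
+; can rule that out:
+;
+;   void test_runtime_check(float *a, float b, long offset, long offset2, long n) {
+;     for (long i = 0; i < n; i++)
+;       a[i + offset] += b * a[i + offset2];
+;   }
+;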
+; CHECK-LABEL: test_runtime_check
+; CHECK: <4 x float>
+define void @test_runtime_check(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %ind.sum = add i64 %iv, %offset
+ %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum
+ %l1 = load float, float* %arr.idx, align 4
+ %ind.sum2 = add i64 %iv, %offset2
+ %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2
+ %l2 = load float, float* %arr.idx2, align 4
+ %m = fmul fast float %b, %l2
+ %ad = fadd fast float %l1, %m
+ store float %ad, float* %arr.idx, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+ ret void
+}
+
+; Check that we do not generate runtime checks if we have found a known
+; dependence preventing vectorization. In this case, it is a read of c[i-1]
+; followed by a write of c[i]. The runtime checks would always fail.
+
+; void test_runtime_check2(float *a, float b, unsigned offset, unsigned offset2, unsigned n, float *c) {
+;   for (unsigned i = 1; i < n; i++) {
+;     a[i+offset] += a[i+offset2] + b;
+;     c[i] = c[i-1] + b;
+;   }
+; }
+;
+; CHECK-LABEL: test_runtime_check2
+; CHECK-NOT: <4 x float>
+define void @test_runtime_check2(float* %a, float %b, i64 %offset, i64 %offset2, i64 %n, float* %c) {
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %ind.sum = add i64 %iv, %offset
+ %arr.idx = getelementptr inbounds float, float* %a, i64 %ind.sum
+ %l1 = load float, float* %arr.idx, align 4
+ %ind.sum2 = add i64 %iv, %offset2
+ %arr.idx2 = getelementptr inbounds float, float* %a, i64 %ind.sum2
+ %l2 = load float, float* %arr.idx2, align 4
+ %m = fmul fast float %b, %l2
+ %ad = fadd fast float %l1, %m
+ store float %ad, float* %arr.idx, align 4
+ %c.ind = add i64 %iv, -1
+ %c.idx = getelementptr inbounds float, float* %c, i64 %c.ind
+ %lc = load float, float* %c.idx, align 4
+ %vc = fadd float %lc, 1.0
+ %c.idx2 = getelementptr inbounds float, float* %c, i64 %iv
+ store float %vc, float* %c.idx2
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %loopexit, label %for.body
+
+loopexit:
+ ret void
+}
+
+; CHECK: !9 = !DILocation(line: 101, column: 1, scope: !{{.*}})
+
+!llvm.module.flags = !{!0, !1}
+!llvm.dbg.cu = !{!9}
+!0 = !{i32 2, !"Dwarf Version", i32 4}
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+
+!2 = !{}
+!3 = !DISubroutineType(types: !2)
+!4 = !DIFile(filename: "test.cpp", directory: "/tmp")
+!5 = distinct !DISubprogram(name: "foo", scope: !4, file: !4, line: 99, type: !3, isLocal: false, isDefinition: true, scopeLine: 100, flags: DIFlagPrototyped, isOptimized: false, unit: !9, retainedNodes: !2)
+!6 = !DILocation(line: 100, column: 1, scope: !5)
+!7 = !DILocation(line: 101, column: 1, scope: !5)
+!8 = !DILocation(line: 102, column: 1, scope: !5)
+!9 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
+ file: !10,
+ isOptimized: true, flags: "-O2",
+ splitDebugFilename: "abc.debug", emissionKind: 2)
+!10 = !DIFile(filename: "path/to/file", directory: "/path/to/dir")
+!11 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-limit.ll b/llvm/test/Transforms/LoopVectorize/runtime-limit.ll
new file mode 100644
index 00000000000..a7f692cef17
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-limit.ll
@@ -0,0 +1,101 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=OVERRIDE
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -pragma-vectorize-memory-check-threshold=6 -dce -instcombine -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -S 2>&1 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; The first loop produces a diagnostic pass remark.
+;CHECK: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
+; The second loop produces a diagnostic analysis remark.
+;CHECK: remark: {{.*}}:0:0: loop not vectorized: cannot prove it is safe to reorder memory operations
+
+; The first loop produces a diagnostic pass remark.
+;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
+; The second loop also produces a diagnostic pass remark.
+;OVERRIDE: remark: {{.*}}:0:0: vectorized loop (vectorization width: 4, interleaved count: 1)
+
+; We vectorize this loop; it needs only 6 runtime checks.
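+; A C-level sketch of func1x6 (hypothetical source; D is passed but never
+; used, and the return value is unimportant):
+;
+;   int func1x6(int *out, int *A, int *B, int *C, int *D, int *E, int *F) {
+;     for (long i = 0; i < 256; i++)
+;       out[i] = A[i] + B[i] + C[i] + E[i] + F[i];
+;     return 0;
+;   }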
+;CHECK-LABEL: func1x6(
+;CHECK: <4 x i32>
+;CHECK: ret
+;OVERRIDE-LABEL: func1x6(
+;OVERRIDE: <4 x i32>
+;OVERRIDE: ret
+define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.016 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.016
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i64 %i.016
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %i.016
+ %2 = load i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %add, %2
+ %arrayidx4 = getelementptr inbounds i32, i32* %E, i64 %i.016
+ %3 = load i32, i32* %arrayidx4, align 4
+ %add5 = add nsw i32 %add3, %3
+ %arrayidx6 = getelementptr inbounds i32, i32* %F, i64 %i.016
+ %4 = load i32, i32* %arrayidx6, align 4
+ %add7 = add nsw i32 %add5, %4
+ %arrayidx8 = getelementptr inbounds i32, i32* %out, i64 %i.016
+ store i32 %add7, i32* %arrayidx8, align 4
+ %inc = add i64 %i.016, 1
+ %exitcond = icmp eq i64 %inc, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 undef
+}
+
+; We do not vectorize this loop because it needs 12 runtime checks.
+;CHECK-LABEL: func2x6(
+;CHECK-NOT: <4 x i32>
+;CHECK: ret
+; We vectorize with 12 checks if a vectorization hint is provided.
+;OVERRIDE-LABEL: func2x6(
+;OVERRIDE: <4 x i32>
+;OVERRIDE: ret
+define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.037 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.037
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %B, i64 %i.037
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add = add nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %i.037
+ %2 = load i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %add, %2
+ %arrayidx4 = getelementptr inbounds i32, i32* %E, i64 %i.037
+ %3 = load i32, i32* %arrayidx4, align 4
+ %add5 = add nsw i32 %add3, %3
+ %arrayidx6 = getelementptr inbounds i32, i32* %F, i64 %i.037
+ %4 = load i32, i32* %arrayidx6, align 4
+ %add7 = add nsw i32 %add5, %4
+ %arrayidx8 = getelementptr inbounds i32, i32* %out, i64 %i.037
+ store i32 %add7, i32* %arrayidx8, align 4
+ %5 = load i32, i32* %arrayidx, align 4
+ %6 = load i32, i32* %arrayidx1, align 4
+ %add11 = add nsw i32 %6, %5
+ %7 = load i32, i32* %arrayidx2, align 4
+ %add13 = add nsw i32 %add11, %7
+ %8 = load i32, i32* %arrayidx4, align 4
+ %add15 = add nsw i32 %add13, %8
+ %9 = load i32, i32* %arrayidx6, align 4
+ %add17 = add nsw i32 %add15, %9
+ %arrayidx18 = getelementptr inbounds i32, i32* %out2, i64 %i.037
+ store i32 %add17, i32* %arrayidx18, align 4
+ %inc = add i64 %i.037, 1
+ %exitcond = icmp eq i64 %inc, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 undef
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/safegep.ll b/llvm/test/Transforms/LoopVectorize/safegep.ll
new file mode 100644
index 00000000000..ecef8138f0b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/safegep.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+target datalayout = "e-p:32:32:32-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
+
+
+; We can vectorize this code because, if the address computation were to wrap,
+; a load from address 0 would take place, which is undefined behavior in
+; address space 0 according to LLVM IR semantics.
+
+; PR16592
+
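+; A C-level sketch of both loops (hypothetical source; @notsafe is identical
+; except that A points into a non-default address space):
+;
+;   void safe(float *A, float *B, float K) {
+;     for (int i = 0; i < 64; i++)
+;       A[i] = A[i] * (B[i] + K);
+;   }
+;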
+; CHECK-LABEL: @safe(
+; CHECK: <4 x float>
+
+define void @safe(float* %A, float* %B, float %K) {
+entry:
+ br label %"<bb 3>"
+
+"<bb 3>":
+ %i_15 = phi i32 [ 0, %entry ], [ %i_19, %"<bb 3>" ]
+ %pp3 = getelementptr float, float* %A, i32 %i_15
+ %D.1396_10 = load float, float* %pp3, align 4
+ %pp24 = getelementptr float, float* %B, i32 %i_15
+ %D.1398_15 = load float, float* %pp24, align 4
+ %D.1399_17 = fadd float %D.1398_15, %K
+ %D.1400_18 = fmul float %D.1396_10, %D.1399_17
+ store float %D.1400_18, float* %pp3, align 4
+ %i_19 = add nsw i32 %i_15, 1
+ %exitcond = icmp ne i32 %i_19, 64
+ br i1 %exitcond, label %"<bb 3>", label %return
+
+return:
+ ret void
+}
+
+; In a non-default address space this rule does not apply, so we cannot
+; vectorize.
+
+; CHECK-LABEL: @notsafe(
+; CHECK-NOT: <4 x float>
+
+define void @notsafe(float addrspace(5) * %A, float* %B, float %K) {
+entry:
+ br label %"<bb 3>"
+
+"<bb 3>":
+ %i_15 = phi i32 [ 0, %entry ], [ %i_19, %"<bb 3>" ]
+ %pp3 = getelementptr float, float addrspace(5) * %A, i32 %i_15
+ %D.1396_10 = load float, float addrspace(5) * %pp3, align 4
+ %pp24 = getelementptr float, float* %B, i32 %i_15
+ %D.1398_15 = load float, float* %pp24, align 4
+ %D.1399_17 = fadd float %D.1398_15, %K
+ %D.1400_18 = fmul float %D.1396_10, %D.1399_17
+ store float %D.1400_18, float addrspace(5) * %pp3, align 4
+ %i_19 = add nsw i32 %i_15, 1
+ %exitcond = icmp ne i32 %i_19, 64
+ br i1 %exitcond, label %"<bb 3>", label %return
+
+return:
+ ret void
+}
+
+
diff --git a/llvm/test/Transforms/LoopVectorize/same-base-access.ll b/llvm/test/Transforms/LoopVectorize/same-base-access.ll
new file mode 100644
index 00000000000..8a67efe30a6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/same-base-access.ll
@@ -0,0 +1,107 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -enable-if-conversion | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; This is kernel11 from the "LivermoreLoops" benchmark. We can't vectorize it
+; because each iteration accesses both x[k] and x[k-1].
+;
+; void kernel11(double *x, double *y, int n) {
+; for ( int k=1 ; k<n ; k++ )
+; x[k] = x[k-1] + y[k];
+; }
+
+; CHECK-LABEL: @kernel11(
+; CHECK-NOT: <4 x double>
+; CHECK: ret
+define i32 @kernel11(double* %x, double* %y, i32 %n) nounwind uwtable ssp {
+ %1 = alloca double*, align 8
+ %2 = alloca double*, align 8
+ %3 = alloca i32, align 4
+ %k = alloca i32, align 4
+ store double* %x, double** %1, align 8
+ store double* %y, double** %2, align 8
+ store i32 %n, i32* %3, align 4
+ store i32 1, i32* %k, align 4
+ br label %4
+
+; <label>:4 ; preds = %25, %0
+ %5 = load i32, i32* %k, align 4
+ %6 = load i32, i32* %3, align 4
+ %7 = icmp slt i32 %5, %6
+ br i1 %7, label %8, label %28
+
+; <label>:8 ; preds = %4
+ %9 = load i32, i32* %k, align 4
+ %10 = sub nsw i32 %9, 1
+ %11 = sext i32 %10 to i64
+ %12 = load double*, double** %1, align 8
+ %13 = getelementptr inbounds double, double* %12, i64 %11
+ %14 = load double, double* %13, align 8
+ %15 = load i32, i32* %k, align 4
+ %16 = sext i32 %15 to i64
+ %17 = load double*, double** %2, align 8
+ %18 = getelementptr inbounds double, double* %17, i64 %16
+ %19 = load double, double* %18, align 8
+ %20 = fadd double %14, %19
+ %21 = load i32, i32* %k, align 4
+ %22 = sext i32 %21 to i64
+ %23 = load double*, double** %1, align 8
+ %24 = getelementptr inbounds double, double* %23, i64 %22
+ store double %20, double* %24, align 8
+ br label %25
+
+; <label>:25 ; preds = %8
+ %26 = load i32, i32* %k, align 4
+ %27 = add nsw i32 %26, 1
+ store i32 %27, i32* %k, align 4
+ br label %4
+
+; <label>:28 ; preds = %4
+ ret i32 0
+}
+
+
+; A[i*7] is scalarized, and the different scalars can in theory wrap
+; around and overwrite other scalar elements. However, we can still
+; vectorize because we can version the loop to avoid this case.
+;
+; void foo(int *a) {
+; for (int i=0; i<256; ++i) {
+; int x = a[i*7];
+; if (x>3)
+; x = x*x+x*4;
+; a[i*7] = x+3;
+; }
+; }
+
+; CHECK-LABEL: @func2(
+; CHECK: <4 x i32>
+; CHECK: ret
+define i32 @func2(i32* nocapture %a) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %7, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %7 ]
+ %2 = mul nsw i64 %indvars.iv, 7
+ %3 = getelementptr inbounds i32, i32* %a, i64 %2
+ %4 = load i32, i32* %3, align 4
+ %5 = icmp sgt i32 %4, 3
+ br i1 %5, label %6, label %7
+
+; <label>:6 ; preds = %1
+ %tmp = add i32 %4, 4
+ %tmp1 = mul i32 %tmp, %4
+ br label %7
+
+; <label>:7 ; preds = %6, %1
+ %x.0 = phi i32 [ %tmp1, %6 ], [ %4, %1 ]
+ %8 = add nsw i32 %x.0, 3
+ store i32 %8, i32* %3, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %9, label %1
+
+; <label>:9 ; preds = %7
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/LoopVectorize/scalar-select.ll b/llvm/test/Transforms/LoopVectorize/scalar-select.ll
new file mode 100644
index 00000000000..3004ace1738
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalar-select.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
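+; A C-level sketch (hypothetical source; a, b, and c are the globals above,
+; and the select condition is loop invariant):
+;
+;   void example1(int cond) {
+;     for (long i = 0; i < 256; i++)
+;       a[i] = cond ? b[i] + c[i] : 0;
+;   }
+;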
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+; Make sure that we have a scalar condition and a vector operand.
+;CHECK: select i1 %cond, <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i1 %cond) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %sel = select i1 %cond, i32 %6, i32 zeroinitializer
+ store i32 %sel, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
new file mode 100644
index 00000000000..bd806aea06f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
@@ -0,0 +1,74 @@
+; RUN: opt < %s -force-vector-width=4 -force-vector-interleave=2 -loop-vectorize -instcombine -S | FileCheck %s
+; RUN: opt < %s -force-vector-width=4 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s --check-prefix=NO-IC
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
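+; A C-level sketch of the nest (hypothetical source; the inner loop is the
+; one being vectorized):
+;
+;   void scalar_after_vectorization_0(int *a, int *b, long x, long y) {
+;     for (long i = 1; i < y; i++)
+;       for (long j = 1; j < y; j++)
+;         b[i * x + j] = a[i * x + j - x];
+;   }
+;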
+; CHECK-LABEL: @scalar_after_vectorization_0
+;
+; CHECK: vector.body:
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %offset.idx = or i64 %index, 1
+; CHECK: %[[T2:.+]] = add nuw nsw i64 %offset.idx, %tmp0
+; CHECK: %[[T3:.+]] = sub nsw i64 %[[T2]], %x
+; CHECK: %[[T4:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T3]]
+; CHECK: %[[T5:.+]] = bitcast i32* %[[T4]] to <4 x i32>*
+; CHECK: load <4 x i32>, <4 x i32>* %[[T5]], align 4
+; CHECK: %[[T6:.+]] = getelementptr inbounds i32, i32* %[[T4]], i64 4
+; CHECK: %[[T7:.+]] = bitcast i32* %[[T6]] to <4 x i32>*
+; CHECK: load <4 x i32>, <4 x i32>* %[[T7]], align 4
+; CHECK: br {{.*}}, label %middle.block, label %vector.body
+;
+; NO-IC-LABEL: @scalar_after_vectorization_0
+;
+; NO-IC: vector.body:
+; NO-IC: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; NO-IC: %offset.idx = add i64 1, %index
+; NO-IC: %[[T2:.+]] = add i64 %offset.idx, 0
+; NO-IC: %[[T3:.+]] = add i64 %offset.idx, 4
+; NO-IC: %[[T4:.+]] = add nuw nsw i64 %[[T2]], %tmp0
+; NO-IC: %[[T5:.+]] = add nuw nsw i64 %[[T3]], %tmp0
+; NO-IC: %[[T6:.+]] = sub nsw i64 %[[T4]], %x
+; NO-IC: %[[T7:.+]] = sub nsw i64 %[[T5]], %x
+; NO-IC: %[[T8:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T6]]
+; NO-IC: %[[T9:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T7]]
+; NO-IC: %[[T10:.+]] = getelementptr inbounds i32, i32* %[[T8]], i32 0
+; NO-IC: %[[T11:.+]] = bitcast i32* %[[T10]] to <4 x i32>*
+; NO-IC: load <4 x i32>, <4 x i32>* %[[T11]], align 4
+; NO-IC: %[[T12:.+]] = getelementptr inbounds i32, i32* %[[T8]], i32 4
+; NO-IC: %[[T13:.+]] = bitcast i32* %[[T12]] to <4 x i32>*
+; NO-IC: load <4 x i32>, <4 x i32>* %[[T13]], align 4
+; NO-IC: br {{.*}}, label %middle.block, label %vector.body
+;
+define void @scalar_after_vectorization_0(i32* noalias %a, i32* noalias %b, i64 %x, i64 %y) {
+
+outer.ph:
+ br label %outer.body
+
+outer.body:
+ %i = phi i64 [ 1, %outer.ph ], [ %i.next, %inner.end ]
+ %tmp0 = mul nuw nsw i64 %i, %x
+ br label %inner.ph
+
+inner.ph:
+ br label %inner.body
+
+inner.body:
+ %j = phi i64 [ 1, %inner.ph ], [ %j.next, %inner.body ]
+ %tmp1 = add nuw nsw i64 %j, %tmp0
+ %tmp2 = sub nsw i64 %tmp1, %x
+ %tmp3 = getelementptr inbounds i32, i32* %a, i64 %tmp2
+ %tmp4 = load i32, i32* %tmp3, align 4
+ %tmp5 = getelementptr inbounds i32, i32* %b, i64 %tmp1
+ store i32 %tmp4, i32* %tmp5, align 4
+ %j.next = add i64 %j, 1
+ %cond.j = icmp slt i64 %j.next, %y
+ br i1 %cond.j, label %inner.body, label %inner.end
+
+inner.end:
+ %i.next = add i64 %i, 1
+ %cond.i = icmp slt i64 %i.next, %y
+ br i1 %cond.i, label %outer.body, label %outer.end
+
+outer.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
new file mode 100644
index 00000000000..833db9310d3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scev-exitlim-crash.ll
@@ -0,0 +1,113 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=8 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@b = common global i32 0, align 4
+@f = common global i32 0, align 4
+@a = common global i32 0, align 4
+@d = common global i32* null, align 8
+@e = common global i32* null, align 8
+@c = common global i32 0, align 4
+
+; CHECK-LABEL: @fn1(
+; CHECK: vector.body
+define void @fn1() #0 {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond, %entry
+ %i.0 = phi i32 [ undef, %entry ], [ %inc, %for.cond ]
+ %cmp = icmp slt i32 %i.0, 0
+ %call = tail call i32 @fn2(double fadd (double fsub (double undef, double undef), double 1.000000e+00)) #2
+ %inc = add nsw i32 %i.0, 1
+ br i1 %cmp, label %for.cond, label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.cond
+ %call.lcssa = phi i32 [ %call, %for.cond ]
+ %cmp514 = icmp sgt i32 %call.lcssa, 0
+ br i1 %cmp514, label %for.cond7.preheader.lr.ph, label %for.end26
+
+for.cond7.preheader.lr.ph: ; preds = %for.cond4.preheader
+ %0 = load i32*, i32** @e, align 8, !tbaa !4
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.cond7.preheader.lr.ph, %for.inc23
+ %y.017 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %inc24, %for.inc23 ]
+ %i.116 = phi i32 [ 0, %for.cond7.preheader.lr.ph ], [ %i.2.lcssa, %for.inc23 ]
+ %n.015 = phi i32 [ undef, %for.cond7.preheader.lr.ph ], [ %inc25, %for.inc23 ]
+ %1 = load i32, i32* @b, align 4, !tbaa !5
+ %tobool11 = icmp eq i32 %1, 0
+ br i1 %tobool11, label %for.inc23, label %for.body8.lr.ph
+
+for.body8.lr.ph: ; preds = %for.cond7.preheader
+ %add9 = add i32 %n.015, 1
+ br label %for.body8
+
+for.body8: ; preds = %for.body8.lr.ph, %for.inc19
+ %indvars.iv19 = phi i64 [ 0, %for.body8.lr.ph ], [ %indvars.iv.next20, %for.inc19 ]
+ %i.213 = phi i32 [ %i.116, %for.body8.lr.ph ], [ 0, %for.inc19 ]
+ %2 = trunc i64 %indvars.iv19 to i32
+ %add10 = add i32 %add9, %2
+ store i32 %add10, i32* @f, align 4, !tbaa !5
+ %idx.ext = sext i32 %add10 to i64
+ %add.ptr = getelementptr inbounds i32, i32* @a, i64 %idx.ext
+ %tobool129 = icmp eq i32 %i.213, 0
+ br i1 %tobool129, label %for.inc19, label %for.body13.lr.ph
+
+for.body13.lr.ph: ; preds = %for.body8
+ %3 = sext i32 %i.213 to i64
+ br label %for.body13
+
+for.body13: ; preds = %for.body13.lr.ph, %for.body13
+ %indvars.iv = phi i64 [ %3, %for.body13.lr.ph ], [ %indvars.iv.next, %for.body13 ]
+ %add.ptr.sum = add i64 %idx.ext, %indvars.iv
+ %arrayidx = getelementptr inbounds i32, i32* @a, i64 %add.ptr.sum
+ %4 = load i32, i32* %arrayidx, align 4, !tbaa !5
+ %arrayidx15 = getelementptr inbounds i32, i32* %0, i64 %indvars.iv
+ store i32 %4, i32* %arrayidx15, align 4, !tbaa !5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %5 = trunc i64 %indvars.iv.next to i32
+ %tobool12 = icmp eq i32 %5, 0
+ br i1 %tobool12, label %for.cond11.for.inc19_crit_edge, label %for.body13
+
+for.cond11.for.inc19_crit_edge: ; preds = %for.body13
+ br label %for.inc19
+
+for.inc19: ; preds = %for.cond11.for.inc19_crit_edge, %for.body8
+ %6 = load i32, i32* @c, align 4, !tbaa !5
+ %inc20 = add nsw i32 %6, 1
+ store i32 %inc20, i32* @c, align 4, !tbaa !5
+ %indvars.iv.next20 = add i64 %indvars.iv19, 1
+ %7 = load i32, i32* @b, align 4, !tbaa !5
+ %tobool = icmp eq i32 %7, 0
+ br i1 %tobool, label %for.cond7.for.inc23_crit_edge, label %for.body8
+
+for.cond7.for.inc23_crit_edge: ; preds = %for.inc19
+ %add.ptr.lcssa = phi i32* [ %add.ptr, %for.inc19 ]
+ store i32* %add.ptr.lcssa, i32** @d, align 8, !tbaa !4
+ br label %for.inc23
+
+for.inc23: ; preds = %for.cond7.for.inc23_crit_edge, %for.cond7.preheader
+ %i.2.lcssa = phi i32 [ 0, %for.cond7.for.inc23_crit_edge ], [ %i.116, %for.cond7.preheader ]
+ %inc24 = add nsw i32 %y.017, 1
+ %inc25 = add nsw i32 %n.015, 1
+ %exitcond = icmp ne i32 %inc24, %call.lcssa
+ br i1 %exitcond, label %for.cond7.preheader, label %for.cond4.for.end26_crit_edge
+
+for.cond4.for.end26_crit_edge: ; preds = %for.inc23
+ br label %for.end26
+
+for.end26: ; preds = %for.cond4.for.end26_crit_edge, %for.cond4.preheader
+ ret void
+}
+declare i32 @fn2(double) #1
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!"int", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"double", !1}
+!4 = !{!0, !0, i64 0}
+!5 = !{!3, !3, i64 0}
diff --git a/llvm/test/Transforms/LoopVectorize/simple-unroll.ll b/llvm/test/Transforms/LoopVectorize/simple-unroll.ll
new file mode 100644
index 00000000000..d90b08e538e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/simple-unroll.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+; for (i=0; i<n; i++){
+; a[i] += i;
+; }
+;CHECK-LABEL: @inc(
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = trunc i64 %indvars.iv to i32
+ %5 = add nsw i32 %3, %4
+ store i32 %5, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/skip-iterations.ll b/llvm/test/Transforms/LoopVectorize/skip-iterations.ll
new file mode 100644
index 00000000000..27007aa713b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/skip-iterations.ll
@@ -0,0 +1,181 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; These tests skip iterations within a VF through break/continue/goto.
+
+; The main difficulty in vectorizing the loops in test1, test2, and test3 is
+; safely speculating that the widened load of A[i] will not fault if the
+; scalarized loop does not fault. For example, the original load in the scalar
+; loop may not fault, but the last iteration of the vectorized load can fault
+; (if it crosses a page boundary, for example). This last vector iteration is
+; where *one* of the scalar iterations leads to the early exit.
+
+; int test(int *A, int Length) {
+; for (int i = 0; i < Length; i++) {
+; if (A[i] > 10.0) goto end;
+; A[i] = 0;
+; }
+; end:
+; return 0;
+; }
+; CHECK-LABEL: test1(
+; CHECK-NOT: <4 x i32>
+define i32 @test1(i32* nocapture %A, i32 %Length) {
+entry:
+ %cmp8 = icmp sgt i32 %Length, 0
+ br i1 %cmp8, label %for.body.preheader, label %end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %if.else
+ %indvars.iv = phi i64 [ %indvars.iv.next, %if.else ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4, !tbaa !15
+ %cmp1 = icmp sgt i32 %0, 10
+ br i1 %cmp1, label %end.loopexit, label %if.else
+
+if.else: ; preds = %for.body
+ store i32 0, i32* %arrayidx, align 4, !tbaa !15
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %1 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %1, %Length
+ br i1 %cmp, label %for.body, label %end.loopexit
+
+end.loopexit: ; preds = %if.else, %for.body
+ br label %end
+
+end: ; preds = %end.loopexit, %entry
+ ret i32 0
+}
+
+; We don't use anything from within the loop on the early-exit path,
+; so we do not need to know which iteration caused the early exit.
+; bool test2(int *A, int Length, int K) {
+; for (int i = 0; i < Length; i++) {
+; if (A[i] == K) return true;
+; }
+; return false;
+; }
+; TODO: Today we do not vectorize this, but we could teach the vectorizer to
+; do so once the hard part of proving/speculating that the A[i:VF-1] loads do
+; not fault is handled by the compiler/hardware.
+
+; CHECK-LABEL: test2(
+; CHECK-NOT: <4 x i32>
+define i32 @test2(i32* nocapture %A, i32 %Length, i32 %K) {
+entry:
+ %cmp8 = icmp sgt i32 %Length, 0
+ br i1 %cmp8, label %for.body.preheader, label %end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %if.else
+ %indvars.iv = phi i64 [ %indvars.iv.next, %if.else ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %ld = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp eq i32 %ld, %K
+ br i1 %cmp1, label %end.loopexit, label %if.else
+
+if.else: ; preds = %for.body
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %trunc = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %trunc, %Length
+ br i1 %cmp, label %for.body, label %end.loopexit
+
+end.loopexit: ; preds = %if.else, %for.body
+ %result.lcssa = phi i32 [ 1, %for.body ], [ 0, %if.else ]
+ br label %end
+
+end: ; preds = %end.loopexit, %entry
+ %result = phi i32 [ %result.lcssa, %end.loopexit ], [ 0, %entry ]
+ ret i32 %result
+}
+
+; We use the IV on the early-exit path, so we need to know
+; which iteration caused the early exit.
+; int test3(int *A, int Length, int K) {
+; for (int i = 0; i < Length; i++) {
+; if (A[i] == K) return i;
+; }
+; return -1;
+; }
+; TODO: Today we do not vectorize this, but we could teach the vectorizer to
+; do so (once we handle the speculation safety of the widened load).
+; CHECK-LABEL: test3(
+; CHECK-NOT: <4 x i32>
+define i32 @test3(i32* nocapture %A, i32 %Length, i32 %K) {
+entry:
+ %cmp8 = icmp sgt i32 %Length, 0
+ br i1 %cmp8, label %for.body.preheader, label %end
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %if.else
+ %indvars.iv = phi i64 [ %indvars.iv.next, %if.else ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %ld = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp eq i32 %ld, %K
+ br i1 %cmp1, label %end.loopexit, label %if.else
+
+if.else: ; preds = %for.body
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %trunc = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %trunc, %Length
+ br i1 %cmp, label %for.body, label %end.loopexit
+
+end.loopexit: ; preds = %if.else, %for.body
+ %result.lcssa = phi i64 [ %indvars.iv, %for.body ], [ -1, %if.else ]
+ %res.trunc = trunc i64 %result.lcssa to i32
+ br label %end
+
+end: ; preds = %end.loopexit, %entry
+ %result = phi i32 [ %res.trunc, %end.loopexit ], [ -1, %entry ]
+ ret i32 %result
+}
+
+; void test4(int *A, int Length, int K, int J) {
+; for (int i = 0; i < Length; i++) {
+; if (A[i] == K) continue;
+; A[i] = J;
+; }
+; }
+; For this test, we vectorize and generate predicated stores to A[i].
+; CHECK-LABEL: test4(
+; CHECK: <4 x i32>
+define void @test4(i32* nocapture %A, i32 %Length, i32 %K, i32 %J) {
+entry:
+ %cmp8 = icmp sgt i32 %Length, 0
+ br i1 %cmp8, label %for.body.preheader, label %end.loopexit
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body:                                        ; preds = %for.body.preheader, %latch
+ %indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %trunc = trunc i64 %indvars.iv.next to i32
+ %ld = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp eq i32 %ld, %K
+ br i1 %cmp1, label %latch, label %if.else
+
+if.else:
+ store i32 %J, i32* %arrayidx, align 4
+ br label %latch
+
+latch:                                           ; preds = %for.body, %if.else
+ %cmp = icmp slt i32 %trunc, %Length
+ br i1 %cmp, label %for.body, label %end.loopexit
+
+end.loopexit:                                    ; preds = %entry, %latch
+ ret void
+}
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/LoopVectorize/small-loop.ll b/llvm/test/Transforms/LoopVectorize/small-loop.ll
new file mode 100644
index 00000000000..378283b464b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/small-loop.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
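+; A C-level sketch of @example1 (hypothetical source; a, b, and c are the
+; globals above):
+;
+;   void example1(void) {
+;     for (long i = 0; i < 8; i++)  // only 8 iterations
+;       a[i] = b[i] + c[i];
+;   }
+;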
+;CHECK-LABEL: @example1(
+;CHECK: load <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 8 ; <----- A really small trip count
+ br i1 %exitcond, label %8, label %1 ; w/o scalar iteration overhead.
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+;CHECK-LABEL: @bound1(
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret void
+define void @bound1(i32 %k) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %large = icmp sge i32 %lftr.wideiv, 8
+ %exitcond = icmp eq i32 %lftr.wideiv, %k
+ %realexit = or i1 %large, %exitcond
+ br i1 %realexit, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/start-non-zero.ll b/llvm/test/Transforms/LoopVectorize/start-non-zero.ll
new file mode 100644
index 00000000000..bf1865127e3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/start-non-zero.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
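+; A C-level sketch (hypothetical source; the induction variable starts at
+; %start instead of zero):
+;
+;   int start_at_nonzero(int *a, int start, int end) {
+;     for (int i = start; i < end; i++)
+;       a[i] *= 333;
+;     return 4;
+;   }
+;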
+;CHECK-LABEL: @start_at_nonzero(
+;CHECK: mul nuw <4 x i32>
+;CHECK: ret i32
+define i32 @start_at_nonzero(i32* nocapture %a, i32 %start, i32 %end) nounwind uwtable ssp {
+entry:
+ %cmp3 = icmp slt i32 %start, %end
+ br i1 %cmp3, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %0 = sext i32 %start to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx, align 4
+ %mul = mul nuw i32 %1, 333
+ store i32 %mul, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %2 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %2, %end
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret i32 4
+}
diff --git a/llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll b/llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll
new file mode 100644
index 00000000000..62b148e2c79
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/store-shuffle-bug.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@uf = common global [100 x i32] zeroinitializer, align 16
+@xi = common global [100 x i32] zeroinitializer, align 16
+@q = common global [100 x i32] zeroinitializer, align 16
+
+; PR16455
+
+
+; Due to a bug in the way we handled reverse induction stores, we would
+; generate one shuffle too many.
+
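+; A C-level sketch of the reverse loop (hypothetical source; uf, xi, and q
+; are the globals above):
+;
+;   void t(void) {
+;     for (long i = 93; i > 2; i--) {
+;       uf[i + 1] += xi[i + 1];
+;       uf[i + 1] += q[i + 1];
+;     }
+;   }
+;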
+define void @t() {
+entry:
+ br label %for.body
+
+; CHECK-LABEL: @t(
+; CHECK: vector.body:
+; CHECK: [[VAR1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: [[VAR2:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: [[VAR3:%[a-zA-Z0-9]+]] = add nsw <4 x i32> [[VAR2]], [[VAR1]]
+; CHECK: store <4 x i32> [[VAR3]]
+; CHECK: [[VAR4:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: add nsw <4 x i32> [[VAR3]], [[VAR4]]
+; CHECK-NOT: shufflevector
+
+for.body:
+ %indvars.iv = phi i64 [ 93, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = add i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* @uf, i64 0, i64 %0
+ %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* @xi, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx3, align 4
+ %2 = load i32, i32* %arrayidx, align 4
+ %add4 = add nsw i32 %2, %1
+ store i32 %add4, i32* %arrayidx, align 4
+ %arrayidx7 = getelementptr inbounds [100 x i32], [100 x i32]* @q, i64 0, i64 %0
+ %3 = load i32, i32* %arrayidx7, align 4
+ %add8 = add nsw i32 %add4, %3
+ store i32 %add8, i32* %arrayidx, align 4
+ %indvars.iv.next = add i64 %indvars.iv, -1
+ %4 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp ugt i32 %4, 2
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/struct_access.ll b/llvm/test/Transforms/LoopVectorize/struct_access.ll
new file mode 100644
index 00000000000..9284627632b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/struct_access.ll
@@ -0,0 +1,87 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%struct.coordinate = type { i32, i32 }
+
+; Make sure that we don't generate a wide load when accessing the struct.
+; struct coordinate {
+; int x;
+; int y;
+; };
+;
+;
+; int foo(struct coordinate *A, int n) {
+;
+; int sum = 0;
+; for (int i = 0; i < n; ++i)
+; sum += A[i].x;
+;
+; return sum;
+; }
+
+;CHECK-LABEL: @foo(
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret
+define i32 @foo(%struct.coordinate* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %sum.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %x = getelementptr inbounds %struct.coordinate, %struct.coordinate* %A, i64 %indvars.iv, i32 0
+ %0 = load i32, i32* %x, align 4
+ %add = add nsw i32 %0, %sum.05
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %sum.0.lcssa
+}
+
+%struct.lit = type { i32 }
+
+; Verify that we still vectorize the access if the struct has the same size as
+; the loaded element.
+; struct lit {
+; int x;
+; };
+;
+;
+; int bar(struct lit *A, int n) {
+;
+; int sum = 0;
+; for (int i = 0; i < n; ++i)
+; sum += A[i].x;
+;
+; return sum;
+; }
+
+;CHECK-LABEL: @bar(
+;CHECK: load <4 x i32>
+;CHECK: ret
+define i32 @bar(%struct.lit* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %sum.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+ %x = getelementptr inbounds %struct.lit, %struct.lit* %A, i64 %indvars.iv, i32 0
+ %0 = load i32, i32* %x, align 4
+ %add = add nsw i32 %0, %sum.05
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ ret i32 %sum.0.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
new file mode 100644
index 00000000000..d02f9285450
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
@@ -0,0 +1,101 @@
+; RUN: opt < %s -tbaa -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -simplifycfg -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -simplifycfg -S | FileCheck %s --check-prefix=CHECK-NOTBAA
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
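+; C-level sketches of the two loops (hypothetical source). TBAA lets the
+; vectorizer distinguish the int accesses from the float accesses, removing
+; the need for some of the runtime checks:
+;
+;   int test1(int *a, const float *b) {
+;     for (long i = 0; i < 1600; i++)
+;       a[i] = (int)b[i];
+;     return 0;
+;   }
+;
+;   int test2(const int *a, const float *b, float *c) {
+;     for (long i = 0; i < 1600; i++)
+;       c[i] = b[i] * (float)a[i];
+;     return 0;
+;   }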
+; Function Attrs: nounwind uwtable
+define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !tbaa !0
+ %conv = fptosi float %0 to i32
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx2, align 4, !tbaa !4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1600
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+
+; TBAA partitions the accesses in this loop, so it can be vectorized without
+; runtime checks.
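+;
+; A rough C equivalent of this loop (illustrative sketch, not the original
+; source):
+;   for (int i = 0; i < 1600; ++i)
+;     a[i] = (int)b[i];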
+
+; CHECK-LABEL: @test1
+; CHECK: entry:
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+
+; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
+
+; CHECK: ret i32 0
+
+; CHECK-NOTBAA-LABEL: @test1
+; CHECK-NOTBAA: icmp ugt i32*
+
+; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
+
+; CHECK-NOTBAA: ret i32 0
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @test2(i32* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4, !tbaa !0
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4, !tbaa !4
+ %conv = sitofp i32 %1 to float
+ %mul = fmul float %0, %conv
+ %arrayidx4 = getelementptr inbounds float, float* %c, i64 %indvars.iv
+ store float %mul, float* %arrayidx4, align 4, !tbaa !0
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1600
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 0
+
+; This test is like the first, except here there is still one runtime check
+; required. Without TBAA, however, two checks are required.
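+;
+; A rough C equivalent (illustrative sketch, not the original source):
+;   for (int i = 0; i < 1600; ++i)
+;     c[i] = b[i] * (float)a[i];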
+
+; CHECK-LABEL: @test2
+; CHECK: icmp ugt float*
+; CHECK: icmp ugt float*
+; CHECK-NOT: icmp uge i32*
+
+; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
+
+; CHECK: ret i32 0
+
+; CHECK-NOTBAA-LABEL: @test2
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA-DAG: icmp ugt float*
+; CHECK-NOTBAA-DAG: icmp ugt i32*
+
+; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
+
+; CHECK-NOTBAA: ret i32 0
+}
+
+attributes #0 = { nounwind uwtable }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"int", !2, i64 0}
+
diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
new file mode 100644
index 00000000000..56f8b3e83c7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -0,0 +1,211 @@
+; This test verifies that the loop vectorizer will not vectorize low trip count
+; loops that require runtime checks (the trip count is computed with profile info).
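+;
+; All of the loops below share roughly this C shape (illustrative sketch);
+; only the bounds and the profile annotations differ:
+;   for (int i = 0; /* exit test varies */; ++i)
+;     tab[i] = tab[i] == 0 ? 2 : 1;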
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -loop-vectorize-with-block-frequency -S | FileCheck %s
+
+target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
+
+@tab = common global [32 x i8] zeroinitializer, align 1
+
+define i32 @foo_low_trip_count1(i32 %bound) {
+; Simple loop with low tripcount. Should not be vectorized.
+
+; CHECK-LABEL: @foo_low_trip_count1(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
+; The loop has the same invocation count as the function, but a low trip
+; count per invocation, so it is not worth vectorizing.
+
+; CHECK-LABEL: @foo_low_trip_count2(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
+; The loop has a low invocation count compared to the function invocation
+; count, but a high trip count per invocation. Vectorize it.
+
+; CHECK-LABEL: @foo_low_trip_count3(
+; CHECK: vector.body:
+
+entry:
+ br i1 %cond, label %for.preheader, label %for.end, !prof !2
+
+for.preheader:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %i.08, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !3
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) {
+; Simple loop with low tripcount and inequality test for exit.
+; Should not be vectorized.
+
+; CHECK-LABEL: @foo_low_trip_count_icmp_sgt(
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp sgt i32 %i.08, %bound
+ br i1 %exitcond, label %for.end, label %for.body, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @const_low_trip_count() {
+; Simple loop with constant, small trip count and no profiling info.
+
+; CHECK-LABEL: @const_low_trip_count
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp slt i32 %i.08, 2
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @const_large_trip_count() {
+; Simple loop with constant large trip count and no profiling info.
+
+; CHECK-LABEL: @const_large_trip_count
+; CHECK: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp slt i32 %i.08, 1000
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @const_small_trip_count_step() {
+; Simple loop with a static, small trip count (induction step of 5) and no profiling info.
+
+; CHECK-LABEL: @const_small_trip_count_step
+; CHECK-NOT: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 5
+ %exitcond = icmp slt i32 %i.08, 10
+ br i1 %exitcond, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
+define i32 @const_trip_over_profile() {
+; The constant trip count takes precedence over the profile data.
+
+; CHECK-LABEL: @const_trip_over_profile
+; CHECK: <{{[0-9]+}} x i8>
+
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
+ %0 = load i8, i8* %arrayidx, align 1
+ %cmp1 = icmp eq i8 %0, 0
+ %. = select i1 %cmp1, i8 2, i8 1
+ store i8 %., i8* %arrayidx, align 1
+ %inc = add nsw i32 %i.08, 1
+ %exitcond = icmp slt i32 %i.08, 1000
+ br i1 %exitcond, label %for.body, label %for.end, !prof !1
+
+for.end: ; preds = %for.body
+ ret i32 0
+}
+
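+; Profile metadata used above. Branch weights apply to a branch's successors
+; in order, so on most of the exit branches above !1 (100:0) estimates a low
+; trip count, while !3 (10:10000) estimates a high trip count per invocation;
+; !2 (10:90) makes the guard in foo_low_trip_count3 rarely enter the loop.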
+!0 = !{!"function_entry_count", i64 100}
+!1 = !{!"branch_weights", i32 100, i32 0}
+!2 = !{!"branch_weights", i32 10, i32 90}
+!3 = !{!"branch_weights", i32 10, i32 10000}
diff --git a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll
new file mode 100644
index 00000000000..e9d053cf0be
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; We used to fail on this loop because we did not properly handle the
+; loop-invariant instruction anchored in the loop when it was used as a
+; getelementptr index. We would use the index from the original loop,
+; resulting in a use not dominated by its definition.
+
+; PR16452
+
+; Verify that we don't miscompile this loop.
+
+; CHECK-LABEL: @t(
+; CHECK: <4 x i32>
+
+define void @t() {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv17 = phi i64 [ %indvars.next, %for.body ], [ 128, %entry ]
+
+ ; Loop invariant anchored in loop.
+ %idxprom21 = zext i32 undef to i64
+
+ %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* undef, i64 0, i64 %idxprom21, i64 %indvars.iv17
+ store i32 undef, i32* %arrayidx23, align 4
+ %indvars.next= add i64 %indvars.iv17, -1
+ %0 = trunc i64 %indvars.next to i32
+ %cmp15 = icmp ugt i32 %0, undef
+ br i1 %cmp15, label %for.body, label %loopexit
+
+loopexit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll b/llvm/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll
new file mode 100644
index 00000000000..d3112b82d1d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S | FileCheck --enable-var-scope %s
+
+; Make sure we attach memcheck metadata to scalarized memory operations even if
+; we're only unrolling.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: vector.memcheck:
+; CHECK-LABEL: vector.body:
+; CHECK: load i32, {{.*}} !alias.scope ![[$MD1:[0-9]+]]
+; CHECK-LABEL: middle.block:
+; CHECK-DAG: ![[$MD1]] = !{![[MD2:[0-9]+]]}
+; CHECK-DAG: ![[MD2]] = distinct !{![[MD2]], ![[MD3:[0-9]+]]}
+; CHECK-DAG: ![[MD3]] = distinct !{![[MD3]], !"LVerDomain"}
+
+; Function Attrs: norecurse nounwind uwtable
+define void @test(i32* nocapture readonly %a, i32* nocapture %b) local_unnamed_addr #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, 77
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+ store i32 %add, i32* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 10000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+attributes #0 = { norecurse nounwind uwtable }
diff --git a/llvm/test/Transforms/LoopVectorize/unroll.ll b/llvm/test/Transforms/LoopVectorize/unroll.ll
new file mode 100644
index 00000000000..74076f69510
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unroll.ll
@@ -0,0 +1,37 @@
+; This test makes sure that the loop will not be unrolled by the vectorizer if
+; the computed VF equals 1.
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
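+; A rough C equivalent (illustrative sketch, not the original source):
+;   for (long i = 0; i < N; ++i)
+;     a[7 * i] = 3;
+;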
+; Make sure there are no geps being merged.
+; CHECK-LABEL: @foo(
+; CHECK: getelementptr
+; CHECK-NOT: getelementptr
+
+@N = common global i32 0, align 4
+@a = common global [1000 x i32] zeroinitializer, align 16
+
+define void @foo() #0 {
+entry:
+ %0 = load i32, i32* @N, align 4
+ %cmp5 = icmp sgt i32 %0, 0
+ br i1 %cmp5, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %conv = sext i32 %0 to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.06 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = mul nuw nsw i64 %i.06, 7
+ %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %mul
+ store i32 3, i32* %arrayidx, align 4
+ %inc = add nuw nsw i64 %i.06, 1
+ %cmp = icmp slt i64 %inc, %conv
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/unroll_novec.ll b/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
new file mode 100644
index 00000000000..4f8db9e7156
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unroll_novec.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-target-num-scalar-regs=16 -force-target-max-scalar-interleave=8 -force-target-instruction-cost=1 -small-loop-cost=40 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+; for (i=0; i<n; i++){
+; a[i] += i;
+; }
+;CHECK-LABEL: @inc(
+;CHECK: load i32, i32*
+;CHECK: load i32, i32*
+;CHECK: load i32, i32*
+;CHECK: load i32, i32*
+;CHECK-NOT: load i32, i32*
+;CHECK: add nsw i32
+;CHECK: add nsw i32
+;CHECK: add nsw i32
+;CHECK: add nsw i32
+;CHECK-NOT: add nsw i32
+;CHECK: store i32
+;CHECK: store i32
+;CHECK: store i32
+;CHECK: store i32
+;CHECK-NOT: store i32
+;CHECK: add i64 %{{.*}}, 4
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = trunc i64 %indvars.iv to i32
+ %5 = add nsw i32 %3, %4
+ store i32 %5, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-dep-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-dep-remark.ll
new file mode 100644
index 00000000000..78e128d897e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsafe-dep-remark.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -pass-remarks-analysis=loop-vectorize < %s 2>&1 | FileCheck %s
+
+; ModuleID = '/tmp/kk.c'
+source_filename = "/tmp/kk.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; 1 void success (char *A, char *B, char *C, char *D, char *E, int N) {
+; 2 for(int i = 0; i < N; i++) {
+; 3 A[i + 1] = A[i] + B[i];
+; 4 C[i] = D[i] * E[i];
+; 5 }
+; 6 }
+
+; CHECK: remark: /tmp/kk.c:3:16: loop not vectorized: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
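+;
+; The store to A[i + 1] on line 3 above is read back as A[i] on the next
+; iteration, a loop-carried dependence at distance 1 that makes
+; vectorization unsafe.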
+
+define void @success(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) !dbg !6 {
+entry:
+ %cmp28 = icmp sgt i32 %N, 0, !dbg !8
+ br i1 %cmp28, label %for.body, label %for.cond.cleanup, !dbg !9
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv, !dbg !11
+ %0 = load i8, i8* %arrayidx, align 1, !dbg !11, !tbaa !12
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 %indvars.iv, !dbg !15
+ %1 = load i8, i8* %arrayidx2, align 1, !dbg !15, !tbaa !12
+ %add = add i8 %1, %0, !dbg !16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !9
+ %arrayidx7 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv.next, !dbg !17
+ store i8 %add, i8* %arrayidx7, align 1, !dbg !18, !tbaa !12
+ %arrayidx9 = getelementptr inbounds i8, i8* %D, i64 %indvars.iv, !dbg !19
+ %2 = load i8, i8* %arrayidx9, align 1, !dbg !19, !tbaa !12
+ %arrayidx12 = getelementptr inbounds i8, i8* %E, i64 %indvars.iv, !dbg !20
+ %3 = load i8, i8* %arrayidx12, align 1, !dbg !20, !tbaa !12
+ %mul = mul i8 %3, %2, !dbg !21
+ %arrayidx16 = getelementptr inbounds i8, i8* %C, i64 %indvars.iv, !dbg !22
+ store i8 %mul, i8* %arrayidx16, align 1, !dbg !23, !tbaa !12
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !9
+ %exitcond = icmp eq i32 %lftr.wideiv, %N, !dbg !9
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !dbg !9
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void, !dbg !10
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/kk.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"PIC Level", i32 2}
+!5 = !{!"clang version 3.9.0 "}
+!6 = distinct !DISubprogram(name: "success", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 2, column: 20, scope: !6)
+!9 = !DILocation(line: 2, column: 3, scope: !6)
+!10 = !DILocation(line: 6, column: 1, scope: !6)
+!11 = !DILocation(line: 3, column: 16, scope: !6)
+!12 = !{!13, !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !DILocation(line: 3, column: 23, scope: !6)
+!16 = !DILocation(line: 3, column: 21, scope: !6)
+!17 = !DILocation(line: 3, column: 5, scope: !6)
+!18 = !DILocation(line: 3, column: 14, scope: !6)
+!19 = !DILocation(line: 4, column: 12, scope: !6)
+!20 = !DILocation(line: 4, column: 19, scope: !6)
+!21 = !DILocation(line: 4, column: 17, scope: !6)
+!22 = !DILocation(line: 4, column: 5, scope: !6)
+!23 = !DILocation(line: 4, column: 10, scope: !6)
diff --git a/llvm/test/Transforms/LoopVectorize/unsized-pointee-crash.ll b/llvm/test/Transforms/LoopVectorize/unsized-pointee-crash.ll
new file mode 100644
index 00000000000..065d8e6e478
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsized-pointee-crash.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @fn1
+define void @fn1() {
+entry:
+ br label %for.body
+
+for.body:
+ %b.05 = phi i32 (...)* [ undef, %entry ], [ %1, %for.body ]
+ %a.04 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %0 = bitcast i32 (...)* %b.05 to i8*
+ %add.ptr = getelementptr i8, i8* %0, i64 1
+ %1 = bitcast i8* %add.ptr to i32 (...)*
+; CHECK: %[[cst:.*]] = bitcast i32 (...)* {{.*}} to i8*
+; CHECK-NEXT: %[[gep:.*]] = getelementptr i8, i8* %[[cst]], i64 1
+ %inc = add nsw i32 %a.04, 1
+ %exitcond = icmp eq i32 %a.04, 63
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/value-ptr-bug.ll b/llvm/test/Transforms/LoopVectorize/value-ptr-bug.ll
new file mode 100644
index 00000000000..ce4601f7b92
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/value-ptr-bug.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; PR16073
+
+; Because we were caching value pointers across a function call that could
+; RAUW them, we would generate the undefined-value store below:
+; SCEVExpander::expandCodeFor would change a value (the start value of an
+; induction) that we had cached in the induction variable list.
+
+; CHECK-LABEL: @test_vh(
+; CHECK-NOT: store <4 x i8> undef
+
+define void @test_vh(i32* %ptr265, i32* %ptr266, i32 %sub267) {
+entry:
+ br label %loop
+
+loop:
+ %inc = phi i32 [ %sub267, %entry ], [ %add, %loop]
+ %ext.inc = sext i32 %inc to i64
+ %add.ptr265 = getelementptr inbounds i32, i32* %ptr265, i64 %ext.inc
+ %add.ptr266 = getelementptr inbounds i32, i32* %ptr266, i64 %ext.inc
+ %add = add i32 %inc, 9
+ %cmp = icmp slt i32 %add, 140
+ br i1 %cmp, label %block1, label %loop
+
+block1:
+ %sub267.lcssa = phi i32 [ %add, %loop ]
+ %add.ptr266.lcssa = phi i32* [ %add.ptr266, %loop ]
+ %add.ptr265.lcssa = phi i32* [ %add.ptr265, %loop ]
+ %tmp29 = bitcast i32* %add.ptr265.lcssa to i8*
+ %tmp30 = bitcast i32* %add.ptr266.lcssa to i8*
+ br label %do.body272
+
+do.body272:
+ %row_width.5 = phi i32 [ %sub267.lcssa, %block1 ], [ %dec, %do.body272 ]
+ %sp.4 = phi i8* [ %tmp30, %block1 ], [ %incdec.ptr273, %do.body272 ]
+ %dp.addr.4 = phi i8* [ %tmp29, %block1 ], [ %incdec.ptr274, %do.body272 ]
+ %incdec.ptr273 = getelementptr inbounds i8, i8* %sp.4, i64 1
+ %tmp31 = load i8, i8* %sp.4, align 1
+ %incdec.ptr274 = getelementptr inbounds i8, i8* %dp.addr.4, i64 1
+ store i8 %tmp31, i8* %dp.addr.4, align 1
+ %dec = add i32 %row_width.5, -1
+ %cmp276 = icmp eq i32 %dec, 0
+ br i1 %cmp276, label %loop.exit, label %do.body272
+
+loop.exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll b/llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
new file mode 100644
index 00000000000..4ddc6a65217
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll
@@ -0,0 +1,211 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s -check-prefix=VF8
+; RUN: opt -S -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix=VF1
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Given a loop with an induction variable which is being
+; truncated/extended using casts that had been proven to
+; be redundant under a runtime test, we want to make sure
+; that these casts do not get vectorized/scalarized/widened.
+; This is the case for inductions whose SCEV expression is
+; of the form "ExtTrunc(%phi) + %step", where "ExtTrunc"
+; can be a result of the IR sequences we check below.
+;
+; See also pr30654.
+;
+
+; Case1: Check the following induction pattern:
+;
+; %p.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+; %sext = shl i32 %p.09, 24
+; %conv = ashr exact i32 %sext, 24
+; %add = add nsw i32 %conv, %step
+;
+; This is the case in the following code:
+;
+; void doit1(int n, int step) {
+; int i;
+; char p = 0;
+; for (i = 0; i < n; i++) {
+; a[i] = p;
+; p = p + step;
+; }
+; }
+;
+; The "ExtTrunc" IR sequence here is:
+; "%sext = shl i32 %p.09, 24"
+; "%conv = ashr exact i32 %sext, 24"
+; We check that it does not appear in the vector loop body, whether
+; we vectorize or scalarize the induction.
+; In the case of widened induction, this means that the induction phi
+; is directly used, without shl/ashr on the way.
+
+; VF8-LABEL: @doit1
+; VF8: vector.body:
+; VF8: %vec.ind = phi <8 x i32>
+; VF8: store <8 x i32> %vec.ind
+; VF8: middle.block:
+
+; VF1-LABEL: @doit1
+; VF1: vector.body:
+; VF1-NOT: %{{.*}} = shl i32
+; VF1: middle.block:
+
+@a = common local_unnamed_addr global [250 x i32] zeroinitializer, align 16
+
+define void @doit1(i32 %n, i32 %step) {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %p.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+ %sext = shl i32 %p.09, 24
+ %conv = ashr exact i32 %sext, 24
+ %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add nsw i32 %conv, %step
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+
+; Case2: Another variant of the above pattern is where the induction variable
+; is used only for address computation (i.e. it is a GEP index) and therefore
+; the induction is not vectorized but rather only the step is widened.
+;
+; This is the case in the following code, where the induction variable 'w_ix'
+; is only used to access the array 'in':
+;
+; void doit2(int *in, int *out, size_t size, size_t step)
+; {
+; int w_ix = 0;
+; for (size_t offset = 0; offset < size; ++offset)
+; {
+; int w = in[w_ix];
+; out[offset] = w;
+; w_ix += step;
+; }
+; }
+;
+; The "ExtTrunc" IR sequence here is similar to the previous case:
+; "%sext = shl i64 %w_ix.012, 32
+; %idxprom = ashr exact i64 %sext, 32"
+; We check that it does not appear in the vector loop body, whether
+; we widen or scalarize the induction.
+; In the case of widened induction, this means that the induction phi
+; is directly used, without shl/ashr on the way.
+
+; VF8-LABEL: @doit2
+; VF8: vector.body:
+; VF8: %vec.ind = phi <8 x i64>
+; VF8: %{{.*}} = extractelement <8 x i64> %vec.ind
+; VF8: middle.block:
+
+; VF1-LABEL: @doit2
+; VF1: vector.body:
+; VF1-NOT: %{{.*}} = shl i64
+; VF1: middle.block:
+;
+
+define void @doit2(i32* nocapture readonly %in, i32* nocapture %out, i64 %size, i64 %step) {
+entry:
+ %cmp9 = icmp eq i64 %size, 0
+ br i1 %cmp9, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %w_ix.011 = phi i64 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+ %offset.010 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %sext = shl i64 %w_ix.011, 32
+ %idxprom = ashr exact i64 %sext, 32
+ %arrayidx = getelementptr inbounds i32, i32* %in, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %offset.010
+ store i32 %0, i32* %arrayidx1, align 4
+ %add = add i64 %idxprom, %step
+ %inc = add nuw i64 %offset.010, 1
+ %exitcond = icmp eq i64 %inc, %size
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Case3: Lastly, check also the following induction pattern:
+;
+; %p.09 = phi i32 [ %val0, %scalar.ph ], [ %add, %for.body ]
+; %conv = and i32 %p.09, 255
+; %add = add nsw i32 %conv, %step
+;
+; This is the case in the following code:
+;
+; int a[N];
+; void doit3(int n, int step) {
+; int i;
+; unsigned char p = 0;
+; for (i = 0; i < n; i++) {
+; a[i] = p;
+; p = p + step;
+; }
+; }
+;
+; The "ExtTrunc" IR sequence here is:
+; "%conv = and i32 %p.09, 255".
+; We check that it does not appear in the vector loop body, whether
+; we vectorize or scalarize the induction.
+
+; VF8-LABEL: @doit3
+; VF8: vector.body:
+; VF8: %vec.ind = phi <8 x i32>
+; VF8: store <8 x i32> %vec.ind
+; VF8: middle.block:
+
+; VF1-LABEL: @doit3
+; VF1: vector.body:
+; VF1-NOT: %{{.*}} = and i32
+; VF1: middle.block:
+
+define void @doit3(i32 %n, i32 %step) {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %p.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+ %conv = and i32 %p.09, 255
+ %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add nsw i32 %conv, %step
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll b/llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
new file mode 100644
index 00000000000..995fc1f2c3d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -O2 -force-vector-interleave=2 -force-vector-width=4 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: LV: Checking a loop in "foo"
+; CHECK: LV: Loop hints: force=enabled
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Ensure that "llvm.loop.vectorize.enable" metadata was not lost even
+; if the loop was not rotated.
+;
+; See http://reviews.llvm.org/D3348 for details.
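+;
+; The loop below is deliberately left in non-rotated form: it exits from its
+; header (%loop_cond) rather than from the latch.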
+;
+; CHECK-LABEL: @foo
+; CHECK: !llvm.loop !0
+; CHECK: !0 = distinct !{!0, !1}
+; CHECK: !1 = !{!"llvm.loop.vectorize.enable", i1 true}
+define i32 @foo(i32 %a) {
+entry:
+ br label %loop_cond
+
+loop_cond:
+ %indx = phi i32 [ 1, %entry ], [ %inc, %loop_inc ]
+ %cmp = icmp ne i32 %indx, %a
+ br i1 %cmp, label %return, label %loop_inc
+
+loop_inc:
+ %inc = add i32 %indx, 1
+ br label %loop_cond, !llvm.loop !0
+
+return:
+ ret i32 0
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/vect.stats.ll b/llvm/test/Transforms/LoopVectorize/vect.stats.ll
new file mode 100644
index 00000000000..909d9c28398
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vect.stats.ll
@@ -0,0 +1,58 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+;
+; We have two loops: one is vectorizable and the other is not.
+;
+
+; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
+; CHECK: 1 loop-vectorize - Number of loops vectorized
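+;
+; @not_vectorized is rejected because of its loop-carried memory accesses:
+; a[i - 5] reads a value stored five iterations earlier, which is unsafe at
+; the forced VF=4 / IC=4.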
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+define void @vectorized(float* nocapture %a, i64 %size) {
+entry:
+ %cmp1 = icmp sle i64 %size, 0
+ %cmp21 = icmp sgt i64 0, %size
+ %or.cond = or i1 %cmp1, %cmp21
+ br i1 %or.cond, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv2 = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv2
+ %0 = load float, float* %arrayidx, align 4
+ %mul = fmul float %0, %0
+ store float %mul, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1
+ %cmp2 = icmp sgt i64 %indvars.iv.next, %size
+ br i1 %cmp2, label %for.end, label %for.body
+
+for.end: ; preds = %entry, %for.body
+ ret void
+}
+
+define void @not_vectorized(float* nocapture %a, i64 %size) {
+entry:
+ %cmp1 = icmp sle i64 %size, 0
+ %cmp21 = icmp sgt i64 0, %size
+ %or.cond = or i1 %cmp1, %cmp21
+ br i1 %or.cond, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv2 = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %0 = add nsw i64 %indvars.iv2, -5
+ %arrayidx = getelementptr inbounds float, float* %a, i64 %0
+ %1 = load float, float* %arrayidx, align 4
+ %2 = add nsw i64 %indvars.iv2, 2
+ %arrayidx2 = getelementptr inbounds float, float* %a, i64 %2
+ %3 = load float, float* %arrayidx2, align 4
+ %mul = fmul float %1, %3
+ %arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv2
+ store float %mul, float* %arrayidx4, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv2, 1
+ %cmp2 = icmp sgt i64 %indvars.iv.next, %size
+ br i1 %cmp2, label %for.end, label %for.body
+
+for.end: ; preds = %entry, %for.body
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vector-geps.ll b/llvm/test/Transforms/LoopVectorize/vector-geps.ll
new file mode 100644
index 00000000000..bd79499d5d3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vector-geps.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
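+; A rough C equivalent of @vector_gep_stored (illustrative sketch): each
+; iteration stores the address &b[i], so a vector of pointers is stored.
+;   for (long i = 0; i < n; ++i)
+;     a[i] = &b[i];
+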
+; CHECK-LABEL: @vector_gep_stored(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
+; CHECK-NEXT: store <4 x i32*> [[TMP1]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+ store i32* %tmp0, i32** %tmp1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
+
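+; A rough C equivalent of @uniform_vector_gep_stored (illustrative sketch);
+; the stored pointer &b[1] is uniform, so it is splatted across the vector.
+;   for (long i = 0; i < n; ++i)
+;     a[i] = &b[1];
+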
+; CHECK-LABEL: @uniform_vector_gep_stored(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, i64 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32*> [[DOTSPLATINSERT]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
+; CHECK-NEXT: store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = getelementptr inbounds i32, i32* %b, i64 1
+ %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+ store i32* %tmp0, i32** %tmp1, align 8
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-once.ll b/llvm/test/Transforms/LoopVectorize/vectorize-once.ll
new file mode 100644
index 00000000000..d407ac1719c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-once.ll
@@ -0,0 +1,76 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S -simplifycfg | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;
+; We want to make sure that we are vectorizing the scalar loop only once,
+; even if the pass manager runs the vectorizer multiple times due to inlining.
+
+
+; This test checks that we add metadata to vectorized loops.
+; CHECK-LABEL: @_Z4foo1Pii(
+; CHECK: <4 x i32>
+; CHECK: llvm.loop
+; CHECK: ret
+
+; This test comes from the loop:
+;
+;int foo (int *A, int n) {
+; return std::accumulate(A, A + n, 0);
+;}
+define i32 @_Z4foo1Pii(i32* %A, i32 %n) #0 {
+entry:
+ %idx.ext = sext i32 %n to i64
+ %add.ptr = getelementptr inbounds i32, i32* %A, i64 %idx.ext
+ %cmp3.i = icmp eq i32 %n, 0
+ br i1 %cmp3.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+for.body.i: ; preds = %entry, %for.body.i
+ %__init.addr.05.i = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+ %__first.addr.04.i = phi i32* [ %incdec.ptr.i, %for.body.i ], [ %A, %entry ]
+ %0 = load i32, i32* %__first.addr.04.i, align 4
+ %add.i = add nsw i32 %0, %__init.addr.05.i
+ %incdec.ptr.i = getelementptr inbounds i32, i32* %__first.addr.04.i, i64 1
+ %cmp.i = icmp eq i32* %incdec.ptr.i, %add.ptr
+ br i1 %cmp.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit: ; preds = %for.body.i, %entry
+ %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+ ret i32 %__init.addr.0.lcssa.i
+}
+
+; This test checks that we don't vectorize loops that are marked with the "width" == 1 metadata.
+; CHECK-LABEL: @_Z4foo2Pii(
+; CHECK-NOT: <4 x i32>
+; CHECK: llvm.loop
+; CHECK: ret
+define i32 @_Z4foo2Pii(i32* %A, i32 %n) #0 {
+entry:
+ %idx.ext = sext i32 %n to i64
+ %add.ptr = getelementptr inbounds i32, i32* %A, i64 %idx.ext
+ %cmp3.i = icmp eq i32 %n, 0
+ br i1 %cmp3.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+for.body.i: ; preds = %entry, %for.body.i
+ %__init.addr.05.i = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+ %__first.addr.04.i = phi i32* [ %incdec.ptr.i, %for.body.i ], [ %A, %entry ]
+ %0 = load i32, i32* %__first.addr.04.i, align 4
+ %add.i = add nsw i32 %0, %__init.addr.05.i
+ %incdec.ptr.i = getelementptr inbounds i32, i32* %__first.addr.04.i, i64 1
+ %cmp.i = icmp eq i32* %incdec.ptr.i, %add.ptr
+ br i1 %cmp.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i, !llvm.loop !0
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit: ; preds = %for.body.i, %entry
+ %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+ ret i32 %__init.addr.0.lcssa.i
+}
+
+attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" }
+
+; CHECK: !0 = distinct !{!0, !1}
+; CHECK: !1 = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: !2 = distinct !{!2, !3, !1}
+; CHECK: !3 = !{!"llvm.loop.unroll.runtime.disable"}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.width", i32 1}
diff --git a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
new file mode 100644
index 00000000000..774b6f26859
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll
@@ -0,0 +1,94 @@
+; RUN: opt -basicaa -loop-vectorize -enable-mem-access-versioning -force-vector-width=2 -force-vector-interleave=1 < %s -S | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that we version this loop by speculating the value 1 for symbolic
+; strides. This also checks that the symbolic stride information is correctly
+; propagated to the memcheck generation. Without this the loop wouldn't
+; vectorize because we couldn't determine the array bounds for the required
+; memchecks.
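+;
+; A rough C equivalent (illustrative sketch, not the original source):
+;   for (int i = 0; i < N; ++i)
+;     A[i * AStride] = B[i * BStride] * C[i * CStride];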
+
+; CHECK-LABEL: test
+define void @test(i32* %A, i64 %AStride,
+ i32* %B, i32 %BStride,
+ i32* %C, i64 %CStride, i32 %N) {
+entry:
+ %cmp13 = icmp eq i32 %N, 0
+ br i1 %cmp13, label %for.end, label %for.body.preheader
+
+; CHECK-DAG: icmp ne i64 %AStride, 1
+; CHECK-DAG: icmp ne i32 %BStride, 1
+; CHECK-DAG: icmp ne i64 %CStride, 1
+; CHECK: or
+; CHECK: or
+; CHECK: br
+
+; CHECK: vector.body
+; CHECK: load <2 x i32>
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %iv.trunc = trunc i64 %indvars.iv to i32
+ %mul = mul i32 %iv.trunc, %BStride
+ %mul64 = zext i32 %mul to i64
+ %arrayidx = getelementptr inbounds i32, i32* %B, i64 %mul64
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul2 = mul nsw i64 %indvars.iv, %CStride
+ %arrayidx3 = getelementptr inbounds i32, i32* %C, i64 %mul2
+ %1 = load i32, i32* %arrayidx3, align 4
+ %mul4 = mul nsw i32 %1, %0
+ %mul3 = mul nsw i64 %indvars.iv, %AStride
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 %mul3
+ store i32 %mul4, i32* %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %N
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
+
+; We used to crash on this function because we removed the fptosi cast when
+; replacing the symbolic stride '%conv'.
+; PR18480
+
+; CHECK-LABEL: fn1
+; CHECK: load <2 x double>
+
+define void @fn1(double* noalias %x, double* noalias %c, double %a) {
+entry:
+ %conv = fptosi double %a to i32
+ %conv2 = add i32 %conv, 4
+ %cmp8 = icmp sgt i32 %conv2, 0
+ br i1 %cmp8, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+ %0 = trunc i64 %indvars.iv to i32
+ %mul = mul nsw i32 %0, %conv
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds double, double* %x, i64 %idxprom
+ %1 = load double, double* %arrayidx, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %c, i64 %indvars.iv
+ store double %1, double* %arrayidx3, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %conv2
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll b/llvm/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll
new file mode 100644
index 00000000000..23e00749eb4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-stress-test-no-explict-vf.ll
@@ -0,0 +1,45 @@
+; REQUIRES: asserts
+; RUN: opt < %s -S -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+; This test checks that, when stress testing VPlan, if the computed VF
+; is 1, we override it to VF = 4.
+
+; CHECK: LV: VPlan computed VF 1.
+; CHECK: LV: VPlan stress testing: overriding computed VF.
+; CHECK: LV: Using VF 4 to build VPlans.
+@arr2 = external global [8 x i32], align 16
+@arr = external global [8 x [8 x i32]], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32 %n) {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.inc8, %entry
+ %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ]
+ %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21
+ %0 = trunc i64 %indvars.iv21 to i32
+ store i32 %0, i32* %arrayidx, align 4
+ %1 = trunc i64 %indvars.iv21 to i32
+ %add = add nsw i32 %1, %n
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ]
+ %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21
+ store i32 %add, i32* %arrayidx7, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.inc8, label %for.body3
+
+for.inc8: ; preds = %for.body3
+ %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1
+ %exitcond23 = icmp eq i64 %indvars.iv.next22, 8
+ br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll b/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll
new file mode 100644
index 00000000000..224e7e8ce1e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan_hcfg_stress_test.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -vplan-verify-hcfg -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s -check-prefix=VERIFIER
+; RUN: opt < %s -loop-vectorize -enable-vplan-native-path -vplan-build-stress-test -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s -check-prefix=NO-VERIFIER -allow-empty
+; REQUIRES: asserts
+
+; Verify that the stress testing flag for the VPlan H-CFG builder works as
+; expected with and without enabling the VPlan H-CFG Verifier.
+
+; VERIFIER: Verifying VPlan H-CFG.
+; NO-VERIFIER-NOT: Verifying VPlan H-CFG.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(i32* nocapture %a, i32* nocapture readonly %b, i32 %N, i32 %M) {
+entry:
+ %cmp32 = icmp sgt i32 %N, 0
+ br i1 %cmp32, label %outer.ph, label %for.end15
+
+outer.ph:
+ %cmp230 = icmp sgt i32 %M, 0
+ %0 = sext i32 %M to i64
+ %wide.trip.count = zext i32 %M to i64
+ %wide.trip.count38 = zext i32 %N to i64
+ br label %outer.body
+
+outer.body:
+ %indvars.iv35 = phi i64 [ 0, %outer.ph ], [ %indvars.iv.next36, %outer.inc ]
+ br i1 %cmp230, label %inner.ph, label %outer.inc
+
+inner.ph:
+ %1 = mul nsw i64 %indvars.iv35, %0
+ br label %inner.body
+
+inner.body:
+ %indvars.iv = phi i64 [ 0, %inner.ph ], [ %indvars.iv.next, %inner.body ]
+ %2 = add nsw i64 %indvars.iv, %1
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %2
+ %3 = load i32, i32* %arrayidx, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %3, i32* %arrayidx12, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %outer.inc, label %inner.body
+
+outer.inc:
+ %indvars.iv.next36 = add nuw nsw i64 %indvars.iv35, 1
+ %exitcond39 = icmp eq i64 %indvars.iv.next36, %wide.trip.count38
+ br i1 %exitcond39, label %for.end15, label %outer.body
+
+for.end15:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/write-only.ll b/llvm/test/Transforms/LoopVectorize/write-only.ll
new file mode 100644
index 00000000000..ae3eb2046f6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/write-only.ll
@@ -0,0 +1,25 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK-LABEL: @read_mod_write_single_ptr(
+;CHECK: load <4 x float>
+;CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ %3 = load float, float* %2, align 4
+ %4 = fmul float %3, 3.000000e+00
+ store float %4, float* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll b/llvm/test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll
new file mode 100644
index 00000000000..90df5cfa8c1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/zero-sized-pointee-crash.ll
@@ -0,0 +1,26 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: @fn1
+define void @fn1() {
+entry-block:
+ br label %middle
+
+middle:
+ %0 = phi {}* [ %3, %middle ], [ inttoptr (i64 0 to {}*), %entry-block ]
+ %1 = bitcast {}* %0 to i8*
+ %2 = getelementptr i8, i8* %1, i64 1
+ %3 = bitcast i8* %2 to {}*
+ %4 = icmp eq i8* %2, undef
+ br i1 %4, label %exit, label %middle
+
+; CHECK: %[[phi:.*]] = phi {}* [ %3, %middle ], [ null, %entry-block ]
+; CHECK-NEXT: %[[bc1:.*]] = bitcast {}* %[[phi]] to i8*
+; CHECK-NEXT: %[[gep:.*]] = getelementptr i8, i8* %[[bc1]], i64 1
+; CHECK-NEXT: %[[bc2:.*]] = bitcast i8* %[[gep]] to {}*
+; CHECK-NEXT: %[[cmp:.*]] = icmp eq i8* %[[gep]], undef
+; CHECK-NEXT: br i1 %[[cmp]],
+
+exit:
+ ret void
+}