| author | Eric Christopher <echristo@gmail.com> | 2019-04-17 04:52:47 +0000 |
|---|---|---|
| committer | Eric Christopher <echristo@gmail.com> | 2019-04-17 04:52:47 +0000 |
| commit | cee313d288a4faf0355d76fb6e0e927e211d08a5 | |
| tree | d386075318d761197779a96e5d8fc0dc7b06342b /llvm/test/Transforms/LoopVectorize/X86 | |
| parent | c3d6a929fdd92fd06d4304675ade8d7210ee711a | |
Revert "Temporarily Revert "Add basic loop fusion pass.""
The reversion apparently deleted the test/Transforms directory.
Will be re-reverting again.
llvm-svn: 358552
Diffstat (limited to 'llvm/test/Transforms/LoopVectorize/X86')
86 files changed, 16477 insertions, 0 deletions
diff --git a/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll new file mode 100644 index 00000000000..60dd07698a3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/already-vectorized.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s +; REQUIRES: asserts +; We want to make sure that we don't even try to vectorize loops again +; The vectorizer used to mark the un-vectorized loop only as already vectorized +; thus, trying to vectorize the vectorized loop again + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = external global [255 x i32] + +; Function Attrs: nounwind readonly uwtable +define i32 @vect() { +; CHECK: LV: Checking a loop in "vect" +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry +; We need to make sure we did vectorize the loop +; CHECK: LV: Found a loop: for.body +; CHECK: LV: We can vectorize this loop! + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %red.05 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 255 + br i1 %exitcond, label %for.end, label %for.body + +; If it did, we have two loops: +; CHECK: vector.body: +; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]] +; CHECK: for.body: +; CHECK: br {{.*}} label %for.body, !llvm.loop [[scalar:![0-9]+]] + +for.end: ; preds = %for.body + ret i32 %add +} + +; Now, we check for the Hint metadata +; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]]} +; CHECK: [[width]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[scalar]] = distinct !{[[scalar]], [[runtime_unroll:![0-9]+]], [[width]]} +; CHECK: [[runtime_unroll]] = !{!"llvm.loop.unroll.runtime.disable"} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/assume.ll b/llvm/test/Transforms/LoopVectorize/X86/assume.ll new file mode 100644 index 00000000000..4fd378d1a0a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/assume.ll @@ -0,0 +1,100 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 { +entry: + br label %for.body + +; CHECK-LABEL: @test1 +; CHECK: vector.body: +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: for.body: +; CHECK: ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %cmp1 = fcmp ogt float %0, 1.000000e+02 + tail call void @llvm.assume(i1 %cmp1) + %add = fadd float %0, 1.000000e+00 + %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %add, float* %arrayidx5, align 4 + 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv, 1599 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Function Attrs: nounwind +declare void @llvm.assume(i1) #1 + +attributes #0 = { nounwind uwtable } +attributes #1 = { nounwind } + +%struct.data = type { float*, float* } + +; Function Attrs: nounwind uwtable +define void @test2(%struct.data* nocapture readonly %d) #0 { +entry: + %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1 + %0 = load float*, float** %b, align 8 + %ptrint = ptrtoint float* %0 to i64 + %maskedptr = and i64 %ptrint, 31 + %maskcond = icmp eq i64 %maskedptr, 0 + %a = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 0 + %1 = load float*, float** %a, align 8 + %ptrint2 = ptrtoint float* %1 to i64 + %maskedptr3 = and i64 %ptrint2, 31 + %maskcond4 = icmp eq i64 %maskedptr3, 0 + br label %for.body + +; CHECK-LABEL: @test2 +; CHECK: vector.body: +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: @llvm.assume +; CHECK: for.body: +; CHECK: ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + tail call void @llvm.assume(i1 %maskcond) + %arrayidx = getelementptr inbounds float, float* %0, i64 %indvars.iv + %2 = load float, float* %arrayidx, align 4 + %add = fadd float %2, 1.000000e+00 + tail call void @llvm.assume(i1 %maskcond4) + %arrayidx5 = getelementptr inbounds float, float* %1, i64 %indvars.iv + store float %add, float* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv, 1599 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx1.ll b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll new file mode 100644 index 00000000000..d384a8162ba --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/avx1.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -loop-vectorize -mattr=avx,+slow-unaligned-mem-32 -S | FileCheck %s --check-prefix=SLOWMEM32 --check-prefix=CHECK +; RUN: opt < %s -loop-vectorize -mattr=avx,-slow-unaligned-mem-32 -S | FileCheck %s --check-prefix=FASTMEM32 --check-prefix=CHECK + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK-LABEL: @read_mod_write_single_ptr( +; CHECK: load <8 x float> +; CHECK: ret i32 +define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds float, float* %a, i64 %indvars.iv + %3 = load float, float* %2, align 4 + %4 = fmul float %3, 3.000000e+00 + store float %4, float* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, 
%0 + ret i32 undef +} + + +; CHECK-LABEL: @read_mod_i64( +; SLOWMEM32: load <2 x i64> +; FASTMEM32: load <4 x i64> +; CHECK: ret i32 +define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i64, i64* %a, i64 %indvars.iv + %3 = load i64, i64* %2, align 4 + %4 = add i64 %3, 3 + store i64 %4, i64* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/avx512.ll b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll new file mode 100644 index 00000000000..0917e007224 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/avx512.ll @@ -0,0 +1,112 @@ +; RUN: opt -mattr=+avx512f --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s +; RUN: opt -mattr=+avx512vl,+prefer-256-bit --loop-vectorize -S < %s | llc -mattr=+avx512f | FileCheck %s --check-prefix=CHECK-PREFER-AVX256 + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +; Verify that we generate 512-bit wide vectors for a basic integer memset +; loop. + +; CHECK-LABEL: f: +; CHECK: vmovdqu64 %zmm{{.}}, +; CHECK-NOT: %ymm + +; Verify that we don't generate 512-bit wide vectors when subtarget feature says not to + +; CHECK-PREFER-AVX256-LABEL: f: +; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}}, +; CHECK-PREFER-AVX256-NOT: %zmm + +define void @f(i32* %a, i32 %n) { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %n, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; Verify that the "prefer-vector-width=256" attribute prevents the use of 512-bit +; vectors + +; CHECK-LABEL: g: +; CHECK: vmovdqu %ymm{{.}}, +; CHECK-NOT: %zmm + +; CHECK-PREFER-AVX256-LABEL: g: +; CHECK-PREFER-AVX256: vmovdqu %ymm{{.}}, +; CHECK-PREFER-AVX256-NOT: %zmm + +define void @g(i32* %a, i32 %n) "prefer-vector-width"="256" { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %n, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + 
ret void +} + +; Verify that the "prefer-vector-width=512" attribute override the subtarget +; vectors + +; CHECK-LABEL: h: +; CHECK: vmovdqu64 %zmm{{.}}, +; CHECK-NOT: %ymm + +; CHECK-PREFER-AVX256-LABEL: h: +; CHECK-PREFER-AVX256: vmovdqu64 %zmm{{.}}, +; CHECK-PREFER-AVX256-NOT: %ymm + +define void @h(i32* %a, i32 %n) "prefer-vector-width"="512" { +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %n, i32* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll new file mode 100644 index 00000000000..f944d1f97da --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll @@ -0,0 +1,108 @@ +; RUN: opt -loop-vectorize -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" +target triple = "x86_64-unknown-linux-gnu" + +; PR34965/D39346 + +; LV retains the original scalar loop intact as remainder loop. However, +; after this transformation, analysis information concerning the remainder +; loop may differ from the original scalar loop. This test is an example of +; that behaviour, where values inside the remainder loop which SCEV could +; originally analyze now require flow-sensitive analysis currently not +; supported in SCEV. In particular, during LV code generation, after turning +; the original scalar loop into the remainder loop, LV expected +; Legal->isConsecutivePtr() to be consistent and return the same output as +; during legal/cost model phases (original scalar loop). Unfortunately, that +; condition was not satisfied because of the aforementioned SCEV limitation. +; After D39346, LV code generation doesn't rely on Legal->isConsecutivePtr(), +; i.e., SCEV. This test verifies that LV is able to handle the described cases. +; +; TODO: The SCEV limitation described before may affect plans to further +; optimize the remainder loop of this particular test case. One tentative +; solution is to detect the problematic IVs in LV (%7 and %8) and perform an +; in-place IV optimization by replacing: +; %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] with +; with +; %8 = sub i32 %7, 1. + + +; Verify that store is vectorized as stride-1 memory access. + +; CHECK-LABEL: @test_01( +; CHECK-NOT: vector.body: + +; This test was originally vectorized, but now SCEV is smart enough to prove +; that its trip count is 1, so it gets ignored by vectorizer. 
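;
; For readability, a rough C-level sketch of the inner-loop IV pattern
; exercised by @test_01/@test_02 below (a hypothetical reconstruction from
; the IR, not part of the original commit). The second IV always trails the
; first by one iteration, which is exactly the flow-sensitive relation that
; SCEV cannot express once the scalar loop becomes the remainder loop:
;
;   int j  = ph2 + 1; // %7
;   int jp = ph2;     // %8, always equal to j - 1
;   do {
;     p[jp + 2] = v >> s; // the stride-1 store being checked
;     jp = j;
;     ++j;
;   } while (j <= 61);    // 610 in @test_02
;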
+; Function Attrs: uwtable +define void @test_01() { + br label %.outer + +; <label>:1: ; preds = %2 + ret void + +; <label>:2: ; preds = %._crit_edge.loopexit + %3 = add nsw i32 %.ph, -2 + br i1 undef, label %1, label %.outer + +.outer: ; preds = %2, %0 + %.ph = phi i32 [ %3, %2 ], [ 336, %0 ] + %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ] + %4 = and i32 %.ph, 30 + %5 = add i32 %.ph2, 1 + br label %6 + +; <label>:6: ; preds = %6, %.outer + %7 = phi i32 [ %5, %.outer ], [ %13, %6 ] + %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] + %9 = add i32 %8, 2 + %10 = zext i32 %9 to i64 + %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10 + %12 = ashr i32 undef, %4 + store i32 %12, i32 addrspace(1)* %11, align 4 + %13 = add i32 %7, 1 + %14 = icmp sgt i32 %13, 61 + br i1 %14, label %._crit_edge.loopexit, label %6 + +._crit_edge.loopexit: ; preds = %._crit_edge.loopexit, %6 + br i1 undef, label %2, label %._crit_edge.loopexit +} + +; After trip count is increased, the test gets vectorized. +; CHECK-LABEL: @test_02( +; CHECK: vector.body: +; CHECK: store <4 x i32> + +; Function Attrs: uwtable +define void @test_02() { + br label %.outer + +; <label>:1: ; preds = %2 + ret void + +; <label>:2: ; preds = %._crit_edge.loopexit + %3 = add nsw i32 %.ph, -2 + br i1 undef, label %1, label %.outer + +.outer: ; preds = %2, %0 + %.ph = phi i32 [ %3, %2 ], [ 336, %0 ] + %.ph2 = phi i32 [ 62, %2 ], [ 110, %0 ] + %4 = and i32 %.ph, 30 + %5 = add i32 %.ph2, 1 + br label %6 + +; <label>:6: ; preds = %6, %.outer + %7 = phi i32 [ %5, %.outer ], [ %13, %6 ] + %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] + %9 = add i32 %8, 2 + %10 = zext i32 %9 to i64 + %11 = getelementptr inbounds i32, i32 addrspace(1)* undef, i64 %10 + %12 = ashr i32 undef, %4 + store i32 %12, i32 addrspace(1)* %11, align 4 + %13 = add i32 %7, 1 + %14 = icmp sgt i32 %13, 610 + br i1 %14, label %._crit_edge.loopexit, label %6 + +._crit_edge.loopexit: ; preds = %._crit_edge.loopexit, %6 + br i1 undef, label %2, label %._crit_edge.loopexit +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll new file mode 100644 index 00000000000..e18159f2462 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -0,0 +1,67 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: PR31671 +; +; Check a pointer in which one of its uses is consecutive-like and another of +; its uses is non-consecutive-like. In the test case below, %tmp3 is the +; pointer operand of an interleaved load, making it consecutive-like. However, +; it is also the pointer operand of a non-interleaved store that will become a +; scatter operation. %tmp3 (and the induction variable) should not be marked +; uniform-after-vectorization. 
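;
; A rough C view of @PR31671 below (a hypothetical reconstruction from the
; IR, not part of the original commit), showing the two uses of the same
; stride-5 pointer: &d->a[i] feeds both a consecutive-like (interleaved)
; load and a scattered store:
;
;   struct data { float a[32000]; int c[3]; char pad[4]; float b[32000]; };
;   void PR31671(float x, struct data *d) {
;     for (long i = 0; i < 32000; i += 5)
;       d->a[i] += x * d->b[i];
;   }
;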
+; +; CHECK: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i +; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i +; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] +; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label %vector.body +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>* +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75> +; CHECK-NEXT: [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]] +; CHECK-NEXT: [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75> +; CHECK-NEXT: [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]] +; CHECK-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body + +%data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] } + +define void @PR31671(float %x, %data* %d) #0 { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i + %tmp1 = load float, float* %tmp0, align 4 + %tmp2 = fmul float %x, %tmp1 + %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i + %tmp4 = load float, float* %tmp3, align 4 + %tmp5 = fadd float %tmp4, %tmp2 + store float %tmp5, float* %tmp3, align 4 + %i.next = add nuw nsw i64 %i, 5 + %cond = icmp slt i64 
%i.next, 32000 + br i1 %cond, label %for.body, label %for.end + +for.end: + ret void +} + +attributes #0 = { "target-cpu"="knl" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll new file mode 100644 index 00000000000..a73d91ba30b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -S -mtriple=x86_64-- -o - %s | FileCheck %s + +; Testcase that verify that we don't get a faulty bitcast that cast between +; different sizes. + +%rec8 = type { i16 } + +@a = global [1 x %rec8] zeroinitializer +@b = global [2 x i16*] zeroinitializer + + +define void @f1() { +; CHECK-LABEL: @f1( +; CHECK-NEXT: bb1: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> undef, i16 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], <i16 0, i16 1> +; CHECK-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16*, i16** [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>* +; CHECK-NEXT: store <2 x i16*> <i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 0), i32 0, i32 0), i16* getelementptr inbounds (%rec8, %rec8* extractelement (<2 x %rec8*> getelementptr ([1 x %rec8], [1 x %rec8]* @a, <2 x i16> zeroinitializer, <2 x i64> zeroinitializer), i32 1), i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: + +bb1: + br label %bb2 + +bb2: + %c.1.0 = phi i16 [ 0, %bb1 ], [ %_tmp9, %bb2 ] + %_tmp1 = zext i16 0 to i64 + %_tmp2 = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 %_tmp1 + %_tmp4 = bitcast %rec8* %_tmp2 to i16* + %_tmp6 = sext i16 %c.1.0 to i64 + %_tmp7 = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 %_tmp6 + store i16* %_tmp4, i16** %_tmp7 + %_tmp9 = add nsw i16 %c.1.0, 1 + %_tmp11 = icmp slt i16 %_tmp9, 2 + br i1 %_tmp11, label %bb2, label %bb3 + +bb3: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll new file mode 100644 index 00000000000..d75b1d940ae --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll @@ -0,0 +1,30 @@ +; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -loop-vectorize -dce -instcombine -S < %s | FileCheck %s + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +@B = common global [1024 x i32] zeroinitializer, align 16 +@A = common global [1024 x i32] zeroinitializer, align 16 + +; We use to not vectorize this loop because the shift was deemed to expensive. +; Now that we differentiate shift cost base on the operand value kind, we will +; vectorize this loop. +; CHECK: ashr <4 x i32> +define void @f() { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %shl = ashr i32 %0, 3 + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A, i64 0, i64 %indvars.iv + store i32 %shl, i32* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll new file mode 100644 index 00000000000..eb2a2a56fae --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -0,0 +1,47 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK-LABEL: @conversion_cost1( +;CHECK: store <32 x i8> +;CHECK: ret +define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 3 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ] + %2 = trunc i64 %indvars.iv to i8 + %3 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + store i8 %2, i8* %3, align 1 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} + +;CHECK-LABEL: @conversion_cost2( +;CHECK: <2 x float> +;CHECK: ret +define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 9 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ] + %add = add nsw i64 %indvars.iv, 3 + %tofp = sitofp i64 %add to float + %gep = getelementptr inbounds float, float* %B, i64 %indvars.iv + store float %tofp, float* %gep, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll new file mode 100644 index 00000000000..0ee2660ed31 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -0,0 +1,82 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | 
FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@c = common global [2048 x i32] zeroinitializer, align 16 +@b = common global [2048 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +; The program below gathers and scatters data. We better not vectorize it. +;CHECK-LABEL: @cost_model_1( +;CHECK-NOT: <2 x i32> +;CHECK-NOT: <4 x i32> +;CHECK-NOT: <8 x i32> +;CHECK: ret void +define void @cost_model_1() nounwind uwtable noinline ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 8 + %idxprom1 = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %idxprom1 + %2 = load i32, i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @d, i64 0, i64 %indvars.iv + %3 = load i32, i32* %arrayidx4, align 4 + %idxprom5 = sext i32 %3 to i64 + %arrayidx6 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %idxprom5 + store i32 %2, i32* %arrayidx6, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; This function uses a stride that is generally too big to benefit from vectorization without +; really good support for a gather load. We were not computing an accurate cost for the +; vectorization and subsequent scalarization of the pointer induction variables. 
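;
; A rough C equivalent of @PR27826 below (a hypothetical reconstruction from
; the IR, not part of the original commit); the stride of 32 floats is what
; makes a vectorized gather unattractive here:
;
;   float PR27826(const float *a, const float *b, int n) {
;     float s = 0.0f;
;     for (long i = 0; i < n; i += 32)
;       s += a[i] + b[i];
;     return s;
;   }
;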
+ +define float @PR27826(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) { +; CHECK-LABEL: @PR27826( +; CHECK-NOT: <4 x float> +; CHECK-NOT: <8 x float> +; CHECK: ret float %s.0.lcssa + +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %preheader, label %for.end + +preheader: + %t0 = sext i32 %n to i64 + br label %for + +for: + %indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for ] + %s.02 = phi float [ 0.0, %preheader ], [ %add4, %for ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv + %t1 = load float, float* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds float, float* %b, i64 %indvars.iv + %t2 = load float, float* %arrayidx3, align 4 + %add = fadd fast float %t1, %s.02 + %add4 = fadd fast float %add, %t2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 32 + %cmp1 = icmp slt i64 %indvars.iv.next, %t0 + br i1 %cmp1, label %for, label %loopexit + +loopexit: + %add4.lcssa = phi float [ %add4, %for ] + br label %for.end + +for.end: + %s.0.lcssa = phi float [ 0.0, %entry ], [ %add4.lcssa, %loopexit ] + ret float %s.0.lcssa +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll new file mode 100644 index 00000000000..af4a48d3a22 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -0,0 +1,149 @@ +; RUN: opt < %s -O3 -simplifycfg -keep-loops=false -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix AUTO_VEC %s + +; This test checks auto-vectorization with FP induction variable. +; The FP operation is not "fast" and requires "fast-math" function attribute. + +;void fp_iv_loop1(float * __restrict__ A, int N) { +; float x = 1.0; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x += 0.5; +; } +;} + + +; AUTO_VEC-LABEL: @fp_iv_loop1( +; AUTO_VEC: vector.body +; AUTO_VEC: store <8 x float> + +define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.06, float* %arrayidx, align 4 + %conv1 = fadd float %x.06, 5.000000e-01 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; The same as the previous, FP operation is not fast, different function attribute +; Vectorization should be rejected. 
+;void fp_iv_loop2(float * __restrict__ A, int N) { +; float x = 1.0; +; for (int i=0; i < N; ++i) { +; A[i] = x; +; x += 0.5; +; } +;} + +; AUTO_VEC-LABEL: @fp_iv_loop2( +; AUTO_VEC-NOT: vector.body +; AUTO_VEC-NOT: store <{{.*}} x float> + +define void @fp_iv_loop2(float* noalias nocapture %A, i32 %N) #1 { +entry: + %cmp4 = icmp sgt i32 %N, 0 + br i1 %cmp4, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %x.06 = phi float [ %conv1, %for.body ], [ 1.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %x.06, float* %arrayidx, align 4 + %conv1 = fadd float %x.06, 5.000000e-01 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; AUTO_VEC-LABEL: @external_use_with_fast_math( +; AUTO_VEC-NEXT: entry: +; AUTO_VEC-NEXT: [[TMP0:%.*]] = icmp sgt i64 %n, 1 +; AUTO_VEC-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 %n, i64 1 +; AUTO_VEC: br i1 {{.*}}, label %for.body, label %vector.ph +; AUTO_VEC: vector.ph: +; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 +; AUTO_VEC: br label %vector.body +; AUTO_VEC: middle.block: +; AUTO_VEC: [[TMP11:%.*]] = add nsw i64 [[N_VEC]], -1 +; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP11]] to double +; AUTO_VEC-NEXT: [[TMP12:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00 +; AUTO_VEC-NEXT: br i1 {{.*}}, label %for.end, label %for.body +; AUTO_VEC: for.end: +; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP12]], %middle.block ], [ %j, %for.body ] +; AUTO_VEC-NEXT: ret double [[J_LCSSA]] +; +define double @external_use_with_fast_math(double* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [%i.next, %for.body] + %j = phi double [ 0.0, %entry ], [ %j.next, %for.body ] + %tmp0 = getelementptr double, double* %a, i64 %i + store double %j, double* %tmp0 + %i.next = add i64 %i, 1 + %j.next = fadd fast double %j, 3.0 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %tmp1 = phi double [ %j, %for.body ] + ret double %tmp1 +} + +; AUTO_VEC-LABEL: @external_use_without_fast_math( +; AUTO_VEC: for.body: +; AUTO_VEC: [[J:%.*]] = phi double [ 0.000000e+00, %entry ], [ [[J_NEXT:%.*]], %for.body ] +; AUTO_VEC: [[J_NEXT]] = fadd double [[J]], 3.000000e+00 +; AUTO_VEC: br i1 {{.*}}, label %for.body, label %for.end +; AUTO_VEC: for.end: +; AUTO_VEC-NEXT: ret double [[J]] +; +define double @external_use_without_fast_math(double* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ 0, %entry ], [%i.next, %for.body] + %j = phi double [ 0.0, %entry ], [ %j.next, %for.body ] + %tmp0 = getelementptr double, double* %a, i64 %i + store double %j, double* %tmp0 + %i.next = add i64 %i, 1 + %j.next = fadd double %j, 3.0 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: + %tmp1 = phi double [ %j, %for.body ] + ret double %tmp1 +} + +attributes #0 = { "no-nans-fp-math"="true" } +attributes #1 = { "no-nans-fp-math"="false" } diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll b/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll new file mode 100644 index 00000000000..07b98b4fb00 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/force-ifcvt.ll @@ -0,0 +1,42 @@ +; RUN: opt -loop-vectorize -S < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: norecurse nounwind uwtable +define void @Test(i32* nocapture %res, i32* nocapture readnone %c, i32* nocapture readonly %d, i32* nocapture readonly %p) #0 { +entry: + br label %for.body + +; CHECK-LABEL: @Test +; CHECK: <4 x i32> + +for.body: ; preds = %cond.end, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %cond.end ] + %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !1 + %cmp1 = icmp eq i32 %0, 0 + %arrayidx3 = getelementptr inbounds i32, i32* %res, i64 %indvars.iv + %1 = load i32, i32* %arrayidx3, align 4, !llvm.access.group !1 + br i1 %cmp1, label %cond.end, label %cond.false + +cond.false: ; preds = %for.body + %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv + %2 = load i32, i32* %arrayidx7, align 4, !llvm.access.group !1 + %add = add nsw i32 %2, %1 + br label %cond.end + +cond.end: ; preds = %for.body, %cond.false + %cond = phi i32 [ %add, %cond.false ], [ %1, %for.body ] + store i32 %cond, i32* %arrayidx3, align 4, !llvm.access.group !1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 + +for.end: ; preds = %cond.end + ret void +} + +attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" } + +!0 = distinct !{!0, !{!"llvm.loop.parallel_accesses", !1}} +!1 = distinct !{} diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll new file mode 100644 index 00000000000..4a56d6b5ebf --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/fp32_to_uint32-cost-model.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx" + +@float_array = common global [10000 x float] zeroinitializer, align 16 +@unsigned_array = common global [10000 x i32] zeroinitializer, align 16 + +; If we need to scalarize the fptoui and then use inserts to build up the +; vector again, then there is certainly no value in going 256-bit wide. 
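;
; A rough C equivalent of @convert below (a hypothetical reconstruction from
; the IR, not part of the original commit):
;
;   void convert(unsigned N) {
;     for (unsigned i = 0; i < N; ++i)
;       unsigned_array[i] = (unsigned) float_array[i];
;   }
;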
+; CHECK-NOT: vinserti128 + +define void @convert(i32 %N) { +entry: + %0 = icmp eq i32 %N, 0 + br i1 %0, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds [10000 x float], [10000 x float]* @float_array, i64 0, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %conv = fptoui float %1 to i32 + %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll new file mode 100644 index 00000000000..c066afcfa63 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/fp64_to_uint32-cost-model.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -mcpu=core-avx2 -loop-vectorize -S | llc -mcpu=core-avx2 | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx" + +@n = global i32 10000, align 4 +@double_array = common global [10000 x double] zeroinitializer, align 16 +@unsigned_array = common global [10000 x i32] zeroinitializer, align 16 + +; If we need to scalarize the fptoui and then use inserts to build up the +; vector again, then there is certainly no value in going 256-bit wide. 
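;
; As in the previous test, a rough C equivalent of @convert below, this time
; converting from double (a hypothetical reconstruction from the IR, not
; part of the original commit):
;
;   void convert(void) {
;     for (unsigned i = 0; i < n; ++i)
;       unsigned_array[i] = (unsigned) double_array[i];
;   }
;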
+; CHECK-NOT: vpinsrd + +define void @convert() { +entry: + %0 = load i32, i32* @n, align 4 + %cmp4 = icmp eq i32 %0, 0 + br i1 %cmp4, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds [10000 x double], [10000 x double]* @double_array, i64 0, i64 %indvars.iv + %1 = load double, double* %arrayidx, align 8 + %conv = fptoui double %1 to i32 + %arrayidx2 = getelementptr inbounds [10000 x i32], [10000 x i32]* @unsigned_array, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %2 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp ult i32 %2, %0 + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll new file mode 100644 index 00000000000..b3a0710545d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/fp_to_sint8-cost-model.ll @@ -0,0 +1,25 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + + +; CHECK: cost of 7 for VF 8 For instruction: %conv = fptosi float %tmp to i8 +define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind { +entry: + br label %for.body +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %tmp = load float, float* %arrayidx, align 4 + %conv = fptosi float %tmp to i8 + %arrayidx2 = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + store i8 %conv, i8* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll new file mode 100644 index 00000000000..88f15e7e148 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll @@ -0,0 +1,45 @@ +; RUN: opt -S -loop-vectorize < %s | FileCheck %s +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +define void @test1() #0 personality i32 (...)* @__CxxFrameHandler3 { +entry: + invoke void @_CxxThrowException(i8* null, i8* null) + to label %unreachable unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch] unwind to caller + +catch: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* null, i32 64, i8* null] + br label %for.body + +for.cond.cleanup: ; preds = %for.body + catchret from %1 to label %try.cont + +for.body: ; preds = %for.body, %catch + %i.07 = phi i32 [ 0, %catch ], [ %inc, %for.body ] + %call = call double @floor(double 1.0) #1 [ "funclet"(token %1) ] + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 
%inc, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +try.cont: ; preds = %for.cond.cleanup + ret void + +unreachable: ; preds = %entry + unreachable +} + +; CHECK-LABEL: define void @test1( +; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null] +; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ] + +declare x86_stdcallcc void @_CxxThrowException(i8*, i8*) + +declare i32 @__CxxFrameHandler3(...) + +declare double @floor(double) #1 + +attributes #0 = { "target-features"="+sse2" } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll new file mode 100644 index 00000000000..fd4981eb04a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/gather-cost.ll @@ -0,0 +1,86 @@ +; RUN: opt -loop-vectorize -mtriple=x86_64-apple-macosx -S -mcpu=corei7-avx -enable-interleaved-mem-accesses=false < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +@kernel = global [512 x float] zeroinitializer, align 16 +@kernel2 = global [512 x float] zeroinitializer, align 16 +@kernel3 = global [512 x float] zeroinitializer, align 16 +@kernel4 = global [512 x float] zeroinitializer, align 16 +@src_data = global [1536 x float] zeroinitializer, align 16 +@r_ = global i8 0, align 1 +@g_ = global i8 0, align 1 +@b_ = global i8 0, align 1 + +; We don't want to vectorize most loops containing gathers because they are +; expensive. This function represents a point where vectorization starts to +; become beneficial. +; Make sure we are conservative and don't vectorize it. 
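;
; A rough C view of @_Z4testmm below (a hypothetical reconstruction from the
; IR, not part of the original commit); each iteration gathers three
; elements of src_data at stride 3 and weights them by the product of the
; four kernel values, so a vector version would need many gathers:
;
;   void test(unsigned long size, unsigned long offset) {
;     float r = 0, g = 0, b = 0;
;     for (unsigned long v = 0; v < size; ++v) {
;       unsigned long m = (v + offset) * 3;
;       float w = kernel[v] * kernel2[v] * kernel3[v] * kernel4[v];
;       r += src_data[m]     * w;
;       g += src_data[m + 1] * w;
;       b += src_data[m + 2] * w;
;     }
;     r_ = (unsigned char) r;
;     g_ = (unsigned char) g;
;     b_ = (unsigned char) b;
;   }
;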
+; CHECK-NOT: x float> + +define void @_Z4testmm(i64 %size, i64 %offset) { +entry: + %cmp53 = icmp eq i64 %size, 0 + br i1 %cmp53, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: + br label %for.body + +for.body: + %r.057 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add10, %for.body ] + %g.056 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add20, %for.body ] + %v.055 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %b.054 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add30, %for.body ] + %add = add i64 %v.055, %offset + %mul = mul i64 %add, 3 + %arrayidx = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %mul + %0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 %v.055 + %1 = load float, float* %arrayidx2, align 4 + %mul3 = fmul fast float %0, %1 + %arrayidx4 = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 %v.055 + %2 = load float, float* %arrayidx4, align 4 + %mul5 = fmul fast float %mul3, %2 + %arrayidx6 = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 %v.055 + %3 = load float, float* %arrayidx6, align 4 + %mul7 = fmul fast float %mul5, %3 + %arrayidx8 = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 %v.055 + %4 = load float, float* %arrayidx8, align 4 + %mul9 = fmul fast float %mul7, %4 + %add10 = fadd fast float %r.057, %mul9 + %arrayidx.sum = add i64 %mul, 1 + %arrayidx11 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum + %5 = load float, float* %arrayidx11, align 4 + %mul13 = fmul fast float %1, %5 + %mul15 = fmul fast float %2, %mul13 + %mul17 = fmul fast float %3, %mul15 + %mul19 = fmul fast float %4, %mul17 + %add20 = fadd fast float %g.056, %mul19 + %arrayidx.sum52 = add i64 %mul, 2 + %arrayidx21 = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 %arrayidx.sum52 + %6 = load float, float* %arrayidx21, align 4 + %mul23 = fmul fast float %1, %6 + %mul25 = fmul fast float %2, %mul23 + %mul27 = fmul fast float %3, %mul25 + %mul29 = fmul fast float %4, %mul27 + %add30 = fadd fast float %b.054, %mul29 + %inc = add i64 %v.055, 1 + %exitcond = icmp ne i64 %inc, %size + br i1 %exitcond, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: + %add30.lcssa = phi float [ %add30, %for.body ] + %add20.lcssa = phi float [ %add20, %for.body ] + %add10.lcssa = phi float [ %add10, %for.body ] + %phitmp = fptoui float %add10.lcssa to i8 + %phitmp60 = fptoui float %add20.lcssa to i8 + %phitmp61 = fptoui float %add30.lcssa to i8 + br label %for.end + +for.end: + %r.0.lcssa = phi i8 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ] + %g.0.lcssa = phi i8 [ %phitmp60, %for.cond.for.end_crit_edge ], [ 0, %entry ] + %b.0.lcssa = phi i8 [ %phitmp61, %for.cond.for.end_crit_edge ], [ 0, %entry ] + store i8 %r.0.lcssa, i8* @r_, align 1 + store i8 %g.0.lcssa, i8* @g_, align 1 + store i8 %b.0.lcssa, i8* @b_, align 1 + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll new file mode 100644 index 00000000000..f38782cb7e5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll @@ -0,0 +1,41 @@ +; RUN: opt -loop-vectorize -S -mattr=avx512f < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; 
This test checks that "gather" operation is choosen since it's cost is better +; than interleaving pattern. +; +;unsigned long A[SIZE]; +;unsigned long B[SIZE]; +; +;void foo() { +; for (int i=0; i<N; i+=8) { +; B[i] = A[i] + 5; +; } +;} + +@A = global [10240 x i64] zeroinitializer, align 16 +@B = global [10240 x i64] zeroinitializer, align 16 + + +; CHECK_LABEL: strided_load_i64 +; CHECK: masked.gather +define void @strided_load_i64() { + br label %1 + +; <label>:1: ; preds = %0, %1 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv + %3 = load i64, i64* %2, align 16 + %4 = add i64 %3, 5 + %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %4, i64* %5, align 16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8 + %6 = icmp slt i64 %indvars.iv.next, 1024 + br i1 %6, label %1, label %7 + +; <label>:7: ; preds = %1 + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll new file mode 100644 index 00000000000..21ed2f2fb39 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -0,0 +1,1754 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 +; RUN: opt < %s -O3 -mcpu=knl -force-vector-width=2 -S | FileCheck %s -check-prefix=FVW2 + +; With a force-vector-width, it is sometimes more profitable to generate +; scalarized and predicated stores instead of masked scatter. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc_linux" + +; The source code: +; +;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) { +; +; for (int i=0; i < SIZE; ++i) { +; if (trigger[i] > 0) { +; out[i] = in[index[i]] + (float) 0.5; +; } +; } +;} + +; Function Attrs: nounwind uwtable +define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) { +; AVX512-LABEL: @foo1( +; AVX512-NEXT: entry: +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4 +; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]] +; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> undef) +; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64> +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef) +; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, 
float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]] +; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]]) +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16 +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 +; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> undef) +; AVX512-NEXT: [[TMP15:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_1]] to <16 x i64> +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP15]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef) +; AVX512-NEXT: [[TMP17:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]]) +; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 32 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]] +; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4 +; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]] +; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> undef) +; AVX512-NEXT: [[TMP25:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_2]] to <16 x i64> +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP25]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> [[TMP22]], <16 x float> undef) +; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, 
float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]] +; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]]) +; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 48 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]] +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4 +; AVX512-NEXT: [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], zeroinitializer +; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]] +; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> undef) +; AVX512-NEXT: [[TMP35:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_3]] to <16 x i64> +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP35]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> [[TMP32]], <16 x float> undef) +; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]] +; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]]) +; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 64 +; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096 +; AVX512-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; AVX512: for.end: +; AVX512-NEXT: ret void +; +; FVW2-LABEL: @foo1( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; FVW2-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> undef) +; FVW2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64> +; 
FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP5]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP6]], i32 4, <2 x i1> [[TMP2]], <2 x float> undef) +; FVW2-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP7]], <2 x float>* [[TMP9]], i32 4, <2 x i1> [[TMP2]]) +; FVW2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 2 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] +; FVW2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP11]], align 4 +; FVW2-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer +; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]] +; FVW2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> undef) +; FVW2-NEXT: [[TMP15:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64> +; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP15]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP16]], i32 4, <2 x i1> [[TMP12]], <2 x float> undef) +; FVW2-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]] +; FVW2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP17]], <2 x float>* [[TMP19]], i32 4, <2 x i1> [[TMP12]]) +; FVW2-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]] +; FVW2-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4 +; FVW2-NEXT: [[TMP22:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_2]], zeroinitializer +; FVW2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]] +; FVW2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef) +; FVW2-NEXT: [[TMP25:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_2]] to <2 x i64> +; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP25]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP22]], <2 x float> undef) +; FVW2-NEXT: [[TMP27:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]] +; FVW2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP27]], <2 x float>* 
[[TMP29]], i32 4, <2 x i1> [[TMP22]])
+; FVW2-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 6
+; FVW2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP31]], align 4
+; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_3]], zeroinitializer
+; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
+; FVW2-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* nonnull [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
+; FVW2-NEXT: [[TMP35:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_3]] to <2 x i64>
+; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP35]]
+; FVW2-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP36]], i32 4, <2 x i1> [[TMP32]], <2 x float> undef)
+; FVW2-NEXT: [[TMP37:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01>
+; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
+; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
+; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP37]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP32]])
+; FVW2-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 8
+; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
+; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; FVW2: for.end:
+; FVW2-NEXT: ret void
+;
+entry:
+ %in.addr = alloca float*, align 8
+ %out.addr = alloca float*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %index.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store float* %in, float** %in.addr, align 8
+ store float* %out, float** %out.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32* %index, i32** %index.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 4096
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load i32*, i32** %index.addr, align 8
+ %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
+ %6 = load i32, i32* %arrayidx3, align 4
+ %idxprom4 = sext i32 %6 to i64
+ %7 = load float*, float** %in.addr, align 8
+ %arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
+ %8 = load float, float* %arrayidx5, align 4
+ %add = fadd float %8, 5.000000e-01
+ %9 = load i32, i32* %i, align 4
+ %idxprom6 = sext i32 %9 to i64
+ %10 = load float*, float** %out.addr, align 8
+ %arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
+ store float %add, float* %arrayidx7, align 4
+ br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+ %11 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %11, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+ ret void
+}
+
+; The source code
+;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
+;
+;  for (int i=0; i<SIZE; i += 16) {
+;    if (trigger[i] > 0) {
+;      out[i] = in[i].b + (float) 0.5;
+;    }
+;  }
+;}
+
+%struct.In = type { float, float }
+
+define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+; AVX512-LABEL: @foo2(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01,
float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> 
[[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1 +; AVX512-NEXT: 
[[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* 
[[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) +; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, 
i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer +; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) +; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer +; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: call void 
@llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) +; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer +; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: ret void +; +; FVW2-LABEL: @foo2( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x 
i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: 
[[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; FVW2: pred.store.continue20:
+; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; FVW2: pred.store.if21:
+; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48
+; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]]
+; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; FVW2: pred.store.continue22:
+; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; FVW2: pred.store.if23:
+; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64
+; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]]
+; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
+; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]]
+; FVW2: pred.store.continue24:
+; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; FVW2: pred.store.if25:
+; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80
+; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]]
+; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1
+; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]]
+; FVW2: pred.store.continue26:
+; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; FVW2: pred.store.if27:
+; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96
+; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]]
+; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]]
+; FVW2: pred.store.continue28:
+; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.if29:
+; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112
+; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]]
+; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4
+; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; FVW2: pred.store.continue30:
+; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
+; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128>
+; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !2
+; FVW2: for.end:
+; FVW2-NEXT: ret void
+;
+entry:
+ %in.addr = alloca %struct.In*, align 8
+ %out.addr = alloca float*, align 8
+ %trigger.addr = alloca i32*, align 8
+ %index.addr = alloca i32*, align 8
+ %i = alloca i32, align 4
+ store %struct.In* %in, %struct.In** %in.addr, align 8
+ store float* %out, float** %out.addr, align 8
+ store i32* %trigger, i32** %trigger.addr, align 8
+ store i32* %index, i32** %index.addr, align 8
+ store i32 0, i32* %i, align 4
+ br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+ %0 = load i32, i32* %i, align 4
+ %cmp = icmp slt i32 %0, 4096
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+ %1 = load i32, i32* %i, align 4
+ %idxprom = sext i32 %1 to i64
+ %2 = load i32*, i32** %trigger.addr, align 8
+ %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+ %3 = load i32, i32* %arrayidx, align 4
+ %cmp1 = icmp sgt i32 %3, 0
+ br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+ %4 = load i32, i32* %i, align 4
+ %idxprom2 = sext i32 %4 to i64
+ %5 = load %struct.In*, %struct.In** %in.addr, align 8
+ %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
+ %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
+ %6 = load float, float* %b, align 4
+ %add = fadd float %6, 5.000000e-01
+ %7 = load i32, i32* %i, align 4
+ %idxprom4 = sext i32 %7 to i64
+ %8 = load float*, float** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
+ store float %add, float* %arrayidx5, align 4
+ br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+ br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+ %9 = load i32, i32* %i, align 4
+ %inc = add nsw i32 %9, 16
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+ ret void
+}
+
+; The source code
+;struct Out {
+;  float a;
+;  float b;
+;};
+;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
+;
+;  for (int i=0; i<SIZE; i += 16) {
+;    if (trigger[i] > 0) {
+;      out[i].b = in[i].b + (float) 0.5;
+;    }
+;  }
+;}
+
+%struct.Out = type { float, float }
+
+define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
+; AVX512-LABEL: @foo3(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]],
%struct.Out* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, 
float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* 
[[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 
3280, i64 3296, i64 3312> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) +; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer +; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> 
[[TMP69]], i32 4, <16 x i1> [[TMP66]]) +; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer +; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) +; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer +; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> <i64 3840, 
i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1 +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: ret void +; +; FVW2-LABEL: @foo3( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD6]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER9]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD6]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] 
= fadd <2 x float> [[WIDE_MASKED_GATHER12]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; FVW2: pred.store.if16: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP20]], i32 1 +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE17]] +; FVW2: pred.store.continue17: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; FVW2: pred.store.if18: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP24]], i32 1 +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE19]] +; FVW2: pred.store.continue19: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; FVW2: pred.store.if20: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP28]], i32 1 +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE21]] +; FVW2: pred.store.continue21: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; FVW2: pred.store.if22: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP32]], i32 1 +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]] +; FVW2: pred.store.continue23: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; FVW2: pred.store.if24: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = 
getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP36]], i32 1 +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]] +; FVW2: pred.store.continue25: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; FVW2: pred.store.if26: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP40]], i32 1 +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]] +; FVW2: pred.store.continue27: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.if28: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP44]], i32 1 +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.continue29: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !3 +; FVW2: for.end: +; FVW2-NEXT: ret void +; +entry: + %in.addr = alloca %struct.In*, align 8 + %out.addr = alloca %struct.Out*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store %struct.In* %in, %struct.In** %in.addr, align 8 + store %struct.Out* %out, %struct.Out** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load %struct.In*, %struct.In** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2 + %b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1 + %6 = load float, float* %b, align 4 + %add = fadd float %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load %struct.Out*, %struct.Out** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4 + %b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1 + store float %add, float* %b6, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 16 + store i32 %inc, i32* 
%i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} +declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>) + +; The same as @foo2 but the scatter/gather argument is a vector of ptrs with addrspace(1) + +define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { +; AVX512-LABEL: @foo2_addrspace( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240> +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, 
float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x 
float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] 
addrspace(1)* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, 
i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 
2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, 
i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, 
float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) +; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer +; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) +; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer +; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 
4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) +; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer +; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: ret void +; +; FVW2-LABEL: @foo2_addrspace( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* 
[[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4 +; FVW2-NEXT: br label 
[[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: 
pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; FVW2: for.end: +; FVW2-NEXT: ret void +; +entry: + %in.addr = alloca %struct.In addrspace(1)*, align 8 + %out.addr = alloca float addrspace(1)*, align 8 + %trigger.addr = alloca i32*, align 8 + %index.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8 + store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32* %index, i32** %index.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2 + %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1 + %6 = load float, float addrspace(1)* %b, align 4 + %add = fadd float %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4 + store float %add, float addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 16 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Same as foo2_addrspace but here only the input has the non-default address space. 
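+; As a quick orientation for the generated checks below (a descriptive
+; comment only, not a FileCheck directive): the pointer address space is
+; mangled into the masked intrinsic names, so loads from the addrspace(1)
+; input are gathered with
+;   @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> ...)
+; while stores to the default addrspace(0) output are scattered with
+;   @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> ..., <16 x float*> ...)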
+ +define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) { +; AVX512-LABEL: @foo2_addrspace2( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240> +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: 
[[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, 
i64 1968, i64 1984, i64 2000, i64 2016, i64 2032>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> 
[[WIDE_MASKED_GATHER_9]], zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) +; AVX512-NEXT: [[TMP65:%.*]] = 
getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer +; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) +; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer +; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, 
i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) +; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer +; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: ret void +; +; FVW2-LABEL: @foo2_addrspace2( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x 
i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; 
FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop 
!5 +; FVW2: for.end: +; FVW2-NEXT: ret void +; +entry: + %in.addr = alloca %struct.In addrspace(1)*, align 8 + %out.addr = alloca float addrspace(0)*, align 8 + %trigger.addr = alloca i32*, align 8 + %index.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8 + store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32* %index, i32** %index.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2 + %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1 + %6 = load float, float addrspace(1)* %b, align 4 + %add = fadd float %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4 + store float %add, float addrspace(0)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 16 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Same as foo2_addrspace but here only the output has the non-default address space. 
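+; Conversely to @foo2_addrspace2 (again a descriptive comment, not a
+; FileCheck directive): with the default-addrspace input and the addrspace(1)
+; output, the checks below gather with
+;   @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> ...)
+; and scatter with
+;   @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> ..., <16 x float addrspace(1)*> ...)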
+ +define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) { +; AVX512-LABEL: @foo2_addrspace3( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240> +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float 
addrspace(1)* [[OUT]], <16 x i64> <i64 256, i64 272, i64 288, i64 304, i64 320, i64 336, i64 352, i64 368, i64 384, i64 400, i64 416, i64 432, i64 448, i64 464, i64 480, i64 496> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 512, i64 528, i64 544, i64 560, i64 576, i64 592, i64 608, i64 624, i64 640, i64 656, i64 672, i64 688, i64 704, i64 720, i64 736, i64 752> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 
5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 768, i64 784, i64 800, i64 816, i64 832, i64 848, i64 864, i64 880, i64 896, i64 912, i64 928, i64 944, i64 960, i64 976, i64 992, i64 1008> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1024, i64 1040, i64 1056, i64 1072, i64 1088, i64 1104, i64 1120, i64 1136, i64 1152, i64 1168, i64 1184, i64 1200, i64 1216, i64 1232, i64 1248, i64 1264> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> 
[[WIDE_MASKED_GATHER7_5]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1280, i64 1296, i64 1312, i64 1328, i64 1344, i64 1360, i64 1376, i64 1392, i64 1408, i64 1424, i64 1440, i64 1456, i64 1472, i64 1488, i64 1504, i64 1520> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1536, i64 1552, i64 1568, i64 1584, i64 1600, i64 1616, i64 1632, i64 1648, i64 1664, i64 1680, i64 1696, i64 1712, i64 1728, i64 1744, i64 1760, i64 1776> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 
2000, i64 2016, i64 2032>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 1792, i64 1808, i64 1824, i64 1840, i64 1856, i64 1872, i64 1888, i64 1904, i64 1920, i64 1936, i64 1952, i64 1968, i64 1984, i64 2000, i64 2016, i64 2032> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2048, i64 2064, i64 2080, i64 2096, i64 2112, i64 2128, i64 2144, i64 2160, i64 2176, i64 2192, i64 2208, i64 2224, i64 2240, i64 2256, i64 2272, i64 2288> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> 
[[WIDE_MASKED_GATHER_9]], zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2304, i64 2320, i64 2336, i64 2352, i64 2368, i64 2384, i64 2400, i64 2416, i64 2432, i64 2448, i64 2464, i64 2480, i64 2496, i64 2512, i64 2528, i64 2544> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2560, i64 2576, i64 2592, i64 2608, i64 2624, i64 2640, i64 2656, i64 2672, i64 2688, i64 2704, i64 2720, i64 2736, i64 2752, i64 2768, i64 2784, i64 2800> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 2816, i64 2832, i64 2848, i64 2864, i64 2880, i64 2896, i64 2912, i64 2928, i64 2944, i64 2960, i64 2976, i64 2992, i64 3008, i64 3024, i64 3040, i64 3056> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3072, i64 3088, i64 3104, i64 3120, i64 3136, i64 3152, i64 3168, i64 3184, i64 3200, i64 3216, i64 3232, i64 3248, i64 3264, i64 3280, i64 3296, i64 3312> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) +; AVX512-NEXT: [[TMP65:%.*]] = getelementptr 
inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer +; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3328, i64 3344, i64 3360, i64 3376, i64 3392, i64 3408, i64 3424, i64 3440, i64 3456, i64 3472, i64 3488, i64 3504, i64 3520, i64 3536, i64 3552, i64 3568> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) +; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer +; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3584, i64 3600, i64 3616, i64 3632, i64 3648, i64 3664, i64 3680, i64 3696, i64 3712, i64 
3728, i64 3744, i64 3760, i64 3776, i64 3792, i64 3808, i64 3824> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) +; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef) +; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer +; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080>, i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01> +; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> <i64 3840, i64 3856, i64 3872, i64 3888, i64 3904, i64 3920, i64 3936, i64 3952, i64 3968, i64 3984, i64 4000, i64 4016, i64 4032, i64 4048, i64 4064, i64 4080> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: ret void +; +; FVW2-LABEL: @foo2_addrspace3( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 16>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 32, i64 32> +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 64, i64 64> +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 96, i64 96> +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: 
[[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01> +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; 
FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 128, i64 128> +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], 
label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; FVW2: for.end: +; FVW2-NEXT: ret void +; +entry: + %in.addr = alloca %struct.In addrspace(0)*, align 8 + %out.addr = alloca float addrspace(1)*, align 8 + %trigger.addr = alloca i32*, align 8 + %index.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8 + store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32* %index, i32** %index.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 4096 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2 + %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1 + %6 = load float, float addrspace(0)* %b, align 4 + %add = fadd float %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4 + store float %add, float addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %inc = add nsw i32 %9, 16 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll new file mode 100644 index 00000000000..c581f4bf2a6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/gcc-examples.ll @@ -0,0 +1,77 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-interleave=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +; Select VF = 8; +;CHECK-LABEL: @example1( +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void + +;UNROLL-LABEL: @example1( +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: ret void +define void @example1() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ 
%indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
+;CHECK-LABEL: @example10b(
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+;UNROLL-LABEL: @example10b(
+;UNROLL: load <4 x i16>
+;UNROLL: load <4 x i16>
+;UNROLL: store <4 x i32>
+;UNROLL: store <4 x i32>
+;UNROLL: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i16, i16* %sb, i64 %indvars.iv
+ %3 = load i16, i16* %2, align 2
+ %4 = sext i16 %3 to i32
+ %5 = getelementptr inbounds i32, i32* %ia, i64 %indvars.iv
+ store i32 %4, i32* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
new file mode 100644
index 00000000000..683e857e5f2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PR15794
+; Adding llvm.mem.parallel_loop_access metadata incorrectly is undefined
+; behaviour. The vectorizer ignores the memory dependency checks, goes ahead,
+; and vectorizes this loop with uniform stores, which have an output dependency.
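+;
+; As a minimal illustrative sketch (the value names below are hypothetical,
+; not taken from this test), a well-formed parallel annotation ties each
+; memory access and the loop latch to the same distinct loop-id node:
+;   store i32 %v, i32* %p, align 4, !llvm.mem.parallel_loop_access !0
+;   br i1 %c, label %exit, label %body, !llvm.loop !0
+;   !0 = distinct !{!0}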
+ +; void foo(int *a, int *b, int k, int m) { +; for (int i = 0; i < m; i++) { +; for (int j = 0; j < m; j++) { +; a[i] = a[i + j + k] + 1; <<< +; } +; b[i] = b[i] + 3; +; } +; } + +; Function Attrs: nounwind uwtable +define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]] +; CHECK: for.body3.lr.ph.us.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]] +; CHECK: for.end.us: +; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP4]], 3 +; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32 +; CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2 +; CHECK: for.body3.us: +; CHECK-NEXT: [[INDVARS_IV29:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV29]] to i32 +; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP5]] +; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP6]], 1 +; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32 +; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3 +; CHECK: for.body3.lr.ph.us: +; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 +; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]] +; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], 
[[MUL_RESULT]] +; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP8]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 false, i1 [[TMP12]], i1 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP16:%.*]] = or i1 false, [[TMP15]] +; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[ADD_US]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP24]], i32 0 +; CHECK-NEXT: store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP24]], i32 1 +; CHECK-NEXT: store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP24]], i32 2 +; CHECK-NEXT: store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3 +; CHECK-NEXT: store i32 [[TMP28]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY3_US]] +; CHECK: for.end15.loopexit: +; CHECK-NEXT: br label [[FOR_END15]] +; CHECK: for.end15: +; CHECK-NEXT: ret void +; +entry: + %cmp27 = icmp sgt i32 %m, 0 + br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15 + +for.end.us: ; preds = %for.body3.us + %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33 + %0 = load i32, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3 + %add10.us = add nsw i32 %0, 3 + store i32 %add10.us, i32* %arrayidx9.us, align 4, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next34 = add i64 %indvars.iv33, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32 + %exitcond36 = icmp eq i32 %lftr.wideiv35, %m + br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5 + +for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us + %indvars.iv29 = phi i64 [ 0, 
%for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ] + %1 = trunc i64 %indvars.iv29 to i32 + %add4.us = add i32 %add.us, %1 + %idxprom.us = sext i32 %add4.us to i64 + %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us + %2 = load i32, i32* %arrayidx.us, align 4, !llvm.mem.parallel_loop_access !3 + %add5.us = add nsw i32 %2, 1 + store i32 %add5.us, i32* %arrayidx7.us, align 4, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next30 = add i64 %indvars.iv29, 1 + %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32 + %exitcond32 = icmp eq i32 %lftr.wideiv31, %m + br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4 + +for.body3.lr.ph.us: ; preds = %for.end.us, %entry + %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ] + %3 = trunc i64 %indvars.iv33 to i32 + %add.us = add i32 %3, %k + %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33 + br label %for.body3.us + +for.end15: ; preds = %for.end.us, %entry + ret void +} + +; Same test as above, but without the invalid parallel_loop_access metadata. + +; Here we can see the vectorizer does the mem dep checks and decides it is +; unsafe to vectorize. +define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 { +; CHECK-LABEL: @no-par-mem-metadata( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]] +; CHECK: for.body3.lr.ph.us.preheader: +; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]] +; CHECK: for.end.us: +; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4 +; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP0]], 3 +; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32 +; CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2 +; CHECK: for.body3.us: +; CHECK-NEXT: [[INDVARS_IV29:%.*]] = phi i64 [ 0, [[FOR_BODY3_LR_PH_US]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV29]] to i32 +; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP1]] +; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP2]], 1 +; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1 +; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32 +; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]] +; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !1 +; CHECK: for.body3.lr.ph.us: +; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 +; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP3]], [[K:%.*]] +; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr 
inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] +; CHECK-NEXT: br label [[FOR_BODY3_US]] +; CHECK: for.end15.loopexit: +; CHECK-NEXT: br label [[FOR_END15]] +; CHECK: for.end15: +; CHECK-NEXT: ret void +; +entry: + %cmp27 = icmp sgt i32 %m, 0 + br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15 + +for.end.us: ; preds = %for.body3.us + %arrayidx9.us = getelementptr inbounds i32, i32* %b, i64 %indvars.iv33 + %0 = load i32, i32* %arrayidx9.us, align 4 + %add10.us = add nsw i32 %0, 3 + store i32 %add10.us, i32* %arrayidx9.us, align 4 + %indvars.iv.next34 = add i64 %indvars.iv33, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32 + %exitcond36 = icmp eq i32 %lftr.wideiv35, %m + br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop !5 + +for.body3.us: ; preds = %for.body3.us, %for.body3.lr.ph.us + %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ] + %1 = trunc i64 %indvars.iv29 to i32 + %add4.us = add i32 %add.us, %1 + %idxprom.us = sext i32 %add4.us to i64 + %arrayidx.us = getelementptr inbounds i32, i32* %a, i64 %idxprom.us + %2 = load i32, i32* %arrayidx.us, align 4 + %add5.us = add nsw i32 %2, 1 + store i32 %add5.us, i32* %arrayidx7.us, align 4 + %indvars.iv.next30 = add i64 %indvars.iv29, 1 + %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32 + %exitcond32 = icmp eq i32 %lftr.wideiv31, %m + br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop !4 + +for.body3.lr.ph.us: ; preds = %for.end.us, %entry + %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ] + %3 = trunc i64 %indvars.iv33 to i32 + %add.us = add i32 %3, %k + %arrayidx7.us = getelementptr inbounds i32, i32* %a, i64 %indvars.iv33 + br label %for.body3.us + +for.end15: ; preds = %for.end.us, %entry + ret void +} + +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!3 = !{!4, !5} +!4 = !{!4} +!5 = !{!5} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll new file mode 100644 index 00000000000..f9ccbf146fc --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -0,0 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin %s | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt -S -loop-vectorize -mtriple=x86_64-apple-darwin -mattr=+avx %s | FileCheck %s --check-prefixes=CHECK,AVX + +; Two mostly identical functions. The only difference is the presence of +; fast-math flags on the second. 
The loop is a pretty simple reduction: + +; for (int i = 0; i < 32; ++i) +; if (arr[i] != 42) +; tot += arr[i]; + +define double @sumIfScalar(double* nocapture readonly %arr) { +; CHECK-LABEL: @sumIfScalar( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] +; CHECK-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]] +; CHECK-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]] +; CHECK-NEXT: [[TST:%.*]] = fcmp une double [[NEXTVAL]], 4.200000e+01 +; CHECK-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]] +; CHECK: do.add: +; CHECK-NEXT: [[TOT_NEW:%.*]] = fadd double [[TOT]], [[NEXTVAL]] +; CHECK-NEXT: br label [[NEXT_ITER]] +; CHECK: no.add: +; CHECK-NEXT: br label [[NEXT_ITER]] +; CHECK: next.iter: +; CHECK-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 +; CHECK-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]] +; CHECK: done: +; CHECK-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ] +; CHECK-NEXT: ret double [[TOT_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %i = phi i32 [0, %entry], [%i.next, %next.iter] + %tot = phi double [0.0, %entry], [%tot.next, %next.iter] + + %addr = getelementptr double, double* %arr, i32 %i + %nextval = load double, double* %addr + + %tst = fcmp une double %nextval, 42.0 + br i1 %tst, label %do.add, label %no.add + +do.add: + %tot.new = fadd double %tot, %nextval + br label %next.iter + +no.add: + br label %next.iter + +next.iter: + %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add] + %i.next = add i32 %i, 1 + %again = icmp ult i32 %i.next, 32 + br i1 %again, label %loop, label %done + +done: + ret double %tot.next +} + +define double @sumIfVector(double* nocapture readonly %arr) { +; SSE-LABEL: @sumIfVector( +; SSE-NEXT: entry: +; SSE-NEXT: br label [[LOOP:%.*]] +; SSE: loop: +; SSE-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] +; SSE-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] +; SSE-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[I]] +; SSE-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]] +; SSE-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01 +; SSE-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]] +; SSE: do.add: +; SSE-NEXT: [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]] +; SSE-NEXT: br label [[NEXT_ITER]] +; SSE: no.add: +; SSE-NEXT: br label [[NEXT_ITER]] +; SSE: next.iter: +; SSE-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ] +; SSE-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; SSE-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 +; SSE-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE:%.*]] +; SSE: done: +; SSE-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ] +; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]] +; +; AVX-LABEL: @sumIfVector( +; AVX-NEXT: entry: +; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; AVX: vector.ph: +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] +; AVX-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] +; AVX-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 +; AVX-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; AVX-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; AVX-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; AVX-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[ARR:%.*]], i32 [[TMP0]] +; AVX-NEXT: [[TMP2:%.*]] = getelementptr double, double* [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <4 x double>* +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP3]], align 8 +; AVX-NEXT: [[TMP4:%.*]] = fcmp fast une <4 x double> [[WIDE_LOAD]], <double 4.200000e+01, double 4.200000e+01, double 4.200000e+01, double 4.200000e+01> +; AVX-NEXT: [[TMP5:%.*]] = fadd fast <4 x double> [[VEC_PHI]], [[WIDE_LOAD]] +; AVX-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true> +; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]] +; AVX-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; AVX-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; AVX: middle.block: +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[PREDPHI]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; AVX-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[PREDPHI]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; AVX-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0 +; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 +; AVX-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]] +; AVX: scalar.ph: +; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: br label [[LOOP:%.*]] +; AVX: loop: +; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] +; AVX-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] +; AVX-NEXT: [[ADDR:%.*]] = getelementptr double, double* [[ARR]], i32 [[I]] +; AVX-NEXT: [[NEXTVAL:%.*]] = load double, double* [[ADDR]] +; AVX-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01 +; AVX-NEXT: br i1 [[TST]], label [[DO_ADD:%.*]], label [[NO_ADD:%.*]] +; AVX: do.add: +; AVX-NEXT: [[TOT_NEW:%.*]] = fadd fast double [[TOT]], [[NEXTVAL]] +; AVX-NEXT: br label [[NEXT_ITER]] +; AVX: no.add: +; AVX-NEXT: br label [[NEXT_ITER]] +; AVX: next.iter: +; AVX-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ] +; AVX-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32 +; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop !2 +; AVX: done: +; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + 
%i = phi i32 [0, %entry], [%i.next, %next.iter]
+  %tot = phi double [0.0, %entry], [%tot.next, %next.iter]
+
+  %addr = getelementptr double, double* %arr, i32 %i
+  %nextval = load double, double* %addr
+
+  %tst = fcmp fast une double %nextval, 42.0
+  br i1 %tst, label %do.add, label %no.add
+
+do.add:
+  %tot.new = fadd fast double %tot, %nextval
+  br label %next.iter
+
+no.add:
+  br label %next.iter
+
+next.iter:
+  %tot.next = phi double [%tot, %no.add], [%tot.new, %do.add]
+  %i.next = add i32 %i, 1
+  %again = icmp ult i32 %i.next, 32
+  br i1 %again, label %loop, label %done
+
+done:
+  ret double %tot.next
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
new file mode 100644
index 00000000000..4d7c0b6f64b
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
@@ -0,0 +1,76 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; This test checks that gather/scatter instructions are not used for the i128 data type.
+;CHECK-NOT: gather
+;CHECK-NOT: scatter
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = common global [151 x i128] zeroinitializer, align 16
+@.str = private unnamed_addr constant [46 x i8] c" PASS.....Y3 1/1 (BUBBLE SORT), X(25) = 5085\0A\00", align 1
+@.str.1 = private unnamed_addr constant [44 x i8] c" FAIL.....Y3 1/1 (BUBBLE SORT), X(25) = %d\0A\00", align 1
+@str = private unnamed_addr constant [45 x i8] c" PASS.....Y3 1/1 (BUBBLE SORT), X(25) = 5085\00"
+
+; Function Attrs: noinline nounwind uwtable
+declare i32 @y3inner() #0
+
+define i32 @main() local_unnamed_addr #0 {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %j.0 = phi i128 [ 99999, %entry ], [ %add10, %do.body ]
+  %i.0 = phi i128 [ 1, %entry ], [ %add11, %do.body ]
+  %and = and i128 %j.0, 32767
+  %idxprom = trunc i128 %i.0 to i64
+  %arrayidx = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom
+  store i128 %and, i128* %arrayidx, align 16
+  %add = add nuw nsw i128 %j.0, 11111
+  %and1 = and i128 %add, 32767
+  %add2 = add nuw nsw i128 %i.0, 1
+  %idxprom3 = trunc i128 %add2 to i64
+  %arrayidx4 = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom3
+  store i128 %and1, i128* %arrayidx4, align 16
+  %add5 = add nuw nsw i128 %j.0, 22222
+  %and6 = and i128 %add5, 32767
+  %add7 = add nuw nsw i128 %i.0, 2
+  %idxprom8 = trunc i128 %add7 to i64
+  %arrayidx9 = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 %idxprom8
+  store i128 %and6, i128* %arrayidx9, align 16
+  %add10 = add nuw nsw i128 %j.0, 33333
+  %add11 = add nuw nsw i128 %i.0, 3
+  %cmp = icmp slt i128 %add11, 149
+  br i1 %cmp, label %do.body, label %do.end
+
+do.end:                                           ; preds = %do.body
+  store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 149), align 16
+  store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 150), align 16
+  %call = tail call i32 @y3inner()
+  %0 = load i128, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 25), align 16
+  %cmp12 = icmp eq i128 %0, 5085
+  br i1 %cmp12, label %if.then, label %if.else
+
+if.then:                                          ; preds = %do.end
+  %puts = tail call i32 @puts(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @str, i64 0, i64 0))
+  br label %if.end
+
+if.else:                                          ; preds = %do.end
+  %coerce.sroa.0.0.extract.trunc = trunc i128 %0 to i64
+  %coerce.sroa.2.0.extract.shift = lshr i128 %0, 64
+  %coerce.sroa.2.0.extract.trunc = trunc i128 %coerce.sroa.2.0.extract.shift to i64
+  %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([44 x i8], [44 x i8]* @.str.1, i64 0, i64 0), i64 %coerce.sroa.0.0.extract.trunc, i64 %coerce.sroa.2.0.extract.trunc)
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret i32 0
+}
+
+; Function Attrs: nounwind
+declare i32 @printf(i8*, ...) #1
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) #2
+
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
new file mode 100644
index 00000000000..15ec344cc3d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -mtriple x86_64 -S | FileCheck %s
+
+%struct.ST4 = type { i32, i32, i32, i32 }
+
+; The gaps between the memory accesses in this function are too large for
+; interleaving.
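+;
+; A rough C equivalent of the loop below, for illustration only (a hypothetical
+; reconstruction; the function, field, and variable names are made up). The
+; second store lands 2^31 ints away from the first, so the four stores cannot
+; be combined into a single interleave group:
+;
+;   struct ST4 { int a, b, c, d; };
+;   void test1(struct ST4 *B) {
+;     for (long i = 0; i < 1024; i++) {
+;       int *p = &B[i].a;
+;       p[0] = 65536;
+;       p[-2147483648LL] = 65536;  /* the oversized gap */
+;       B[i].c = 10;
+;       B[i].d = 12;
+;     }
+;   }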
+ +; Test from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=7560 +define void @test1(%struct.ST4* noalias %B) { +; CHECK-LABEL: @test1 +; CHECK-NEXT: entry: +; CHECK-NEXT: br label %for.body + +; CHECK-LABEL: for.body: +; CHECK: store i32 +; CHECK: store i32 +; CHECK: store i32 +; CHECK: store i32 +; CHECK-NOT: store +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %p1 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0 + store i32 65536, i32* %p1, align 4 + %p2 = getelementptr i32, i32* %p1, i32 -2147483648 + store i32 65536, i32* %p2, align 4 + %p3 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2 + store i32 10, i32* %p3, align 4 + %p4 = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3 + store i32 12, i32* %p4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll new file mode 100644 index 00000000000..9294c92b575 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -0,0 +1,36 @@ +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine < %s | FileCheck %s --check-prefix=NORMAL +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=slm < %s | FileCheck %s --check-prefix=NORMAL +; RUN: opt -S -mtriple=x86_64-pc_linux -loop-vectorize -instcombine -mcpu=atom < %s | FileCheck %s --check-prefix=ATOM + +; NORMAL-LABEL: foo +; NORMAL: %[[WIDE:.*]] = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; NORMAL: %[[STRIDED1:.*]] = shufflevector <8 x i32> %[[WIDE]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6> +; NORMAL: %[[STRIDED2:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7> +; NORMAL: add nsw <4 x i32> %[[STRIDED2]], %[[STRIDED1]] + +; ATOM-LABEL: foo +; ATOM: load i32 +; ATOM: load i32 +; ATOM: store i32 +define void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %2 = or i64 %0, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 %2 + %3 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %3, %1 + %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll new file mode 100644 index 00000000000..cf6cc1356e0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -S -mattr=avx512f -instcombine < %s | FileCheck %s + +target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @inv_load_conditional( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT5]], <16 x i32*> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT6]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT6]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1> +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] 
], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[A]], null
+; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4
+; CHECK-NEXT: br i1 [[CMP]], label [[LATCH]], label [[COND_LOAD:%.*]]
+; CHECK: cond_load:
+; CHECK-NEXT: [[ALOAD:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT: br label [[LATCH]]
+; CHECK: latch:
+; CHECK-NEXT: [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ]
+; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7
+; CHECK: for.end:
+; CHECK-NEXT: [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[A_LCSSA_LCSSA]]
+;
+entry:
+  %ntrunc = trunc i64 %n to i32
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp1, align 8
+  %cmp = icmp ne i32* %a, null
+  store i32 %ntrunc, i32* %tmp1
+  br i1 %cmp, label %cond_load, label %latch
+
+cond_load:
+  %aload = load i32, i32* %a, align 4
+  br label %latch
+
+latch:
+  %a.lcssa = phi i32 [ %aload, %cond_load ], [ 1, %for.body ]
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret i32 %a.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
new file mode 100644
index 00000000000..69f578cf789
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S -mattr=avx512f -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The first test checks that a loop with a reduction and a uniform store gets
+; vectorized.
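+;
+; A rough C equivalent of the IR below, for readability (a hypothetical
+; reconstruction; the function and variable names are illustrative, not taken
+; from any original source):
+;
+;   int reduce_with_uniform_store(int *a, long n, int *b) {
+;     int sum = 0;
+;     for (long i = 0; i < n; i++) {
+;       sum += b[i];   /* reduction */
+;       *a = (int)n;   /* store to a loop-invariant address */
+;     }
+;     return sum;
+;   }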
+; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction +; CHECK-LABEL: vector.memcheck: +; CHECK: found.conflict + +; CHECK-LABEL: vector.body: +; CHECK: %vec.phi = phi <16 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ] +; CHECK: %wide.load = load <16 x i32> +; CHECK: [[ADD]] = add <16 x i32> %vec.phi, %wide.load +; CHECK: store i32 %ntrunc, i32* %a +; CHECK-NOT: store i32 %ntrunc, i32* %a +; CHECK: %index.next = add i64 %index, 64 + +; CHECK-LABEL: middle.block: +; CHECK: %rdx.shuf = shufflevector <16 x i32> +define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %tmp3 = add i32 %tmp0, %tmp2 + store i32 %ntrunc, i32* %a + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + %tmp4 = phi i32 [ %tmp3, %for.body ] + ret i32 %tmp4 +} + +; Conditional store +; if (b[i] == k) a = ntrunc +define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @inv_val_store_to_inv_address_conditional( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT5]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT9]], <16 x i32*> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: 
[[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !8, !noalias !11 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP5]], align 4, !alias.scope !8, !noalias !11 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT8]], <16 x i32*> [[BROADCAST_SPLAT10]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]] +; CHECK: cond_store: +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14 +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %cmp = icmp eq i32 %tmp2, %k + store i32 %ntrunc, i32* %tmp1 + br i1 %cmp, label %cond_store, label %latch + +cond_store: + store i32 %ntrunc, i32* %a + br label %latch + +latch: + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) { +; CHECK-LABEL: @variant_val_store_to_inv_address_conditional( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[C5:%.*]] = bitcast i32* [[C:%.*]] to i8* +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] +; CHECK-NEXT: 
[[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i32, i32* [[C]], i64 [[SMAX2]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND08:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[B]] +; CHECK-NEXT: [[BOUND19:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]] +; CHECK-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT10]] +; CHECK-NEXT: [[BOUND012:%.*]] = icmp ugt i32* [[SCEVGEP6]], [[A]] +; CHECK-NEXT: [[BOUND113:%.*]] = icmp ugt i8* [[UGLYGEP]], [[C5]] +; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] +; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT16]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT18]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT20]], <16 x i32*> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT17]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT19]], <16 x i32>* [[TMP5]], align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 8, <16 x i1> [[TMP4]], <16 x i32> undef), !alias.scope !21 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT21]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !23 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], 
[ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP2]], [[K]] +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_STORE:%.*]], label [[LATCH]] +; CHECK: cond_store: +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[I]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 8 +; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !24 +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %cmp = icmp eq i32 %tmp2, %k + store i32 %ntrunc, i32* %tmp1 + br i1 %cmp, label %cond_store, label %latch + +cond_store: + %tmp3 = getelementptr inbounds i32, i32* %c, i64 %i + %tmp4 = load i32, i32* %tmp3, align 8 + store i32 %tmp4, i32* %a + br label %latch + +latch: + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/X86/lit.local.cfg new file mode 100644 index 00000000000..e71f3cc4c41 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'X86' in config.root.targets: + config.unsupported = True + diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll new file mode 100644 index 00000000000..419cedbe145 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -0,0 +1,3374 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1 +; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2 +; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc_linux" + +; The source code: +; +;void foo1(int *A, int *B, int *trigger) { +; +; for (int i=0; i<10000; i++) { +; if (trigger[i] < 100) { +; A[i] = B[i] + trigger[i]; +; } +; } +;} + +; Function Attrs: nounwind uwtable +define void @foo1(i32* %A, i32* %B, i32* %trigger) { +; AVX1-LABEL: @foo1( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000 +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]] +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND016:%.*]] 
= icmp ugt i32* [[SCEVGEP14]], [[A]] +; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0 +; AVX1-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !3 +; AVX1-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP5]], <8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !5, !noalias !7 +; AVX1-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0 +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3 +; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP13]], <8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 +; AVX1-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP18:%.*]] = load i32, i32* 
[[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4 +; AVX1-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]] +; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; +; AVX2-LABEL: @foo1( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]] +; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 8 +; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 24 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: 
[[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 8 +; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 24 +; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP20]], <8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 8 +; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP21]], <8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16 +; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP22]], <8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 24 +; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP23]], <8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x 
i32>* [[TMP33]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 8 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16 +; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP37]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 24 +; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP39]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 8 +; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16 +; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 24 +; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* nonnull [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP52]], <8 x i32>* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 8 +; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>* +; AVX2-NEXT: call void 
@llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP53]], <8 x i32>* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16 +; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP54]], <8 x i32>* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 24 +; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP55]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64 +; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 +; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX2: for.body.preheader: +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4 +; AVX2-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] +; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4 +; AVX2-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], 
[[TMP69]] +; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4 +; AVX2-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]] +; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10 +; +; AVX512-LABEL: @foo1( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]] +; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 32 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 48 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 
100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 32 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 48 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP20]], <16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP21]], <16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 32 +; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP22]], <16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i64 48 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* +; AVX512-NEXT: call void 
@llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP23]], <16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP33]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 16 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 32 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP37]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i64 48 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP39]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 16 +; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 32 +; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP44]], i64 48 +; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* nonnull [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> 
[[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP52]], <16 x i32>* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 16 +; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP53]], <16 x i32>* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 32 +; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP54]], <16 x i32>* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i64 48 +; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP55]], <16 x i32>* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128 +; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 +; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX512: for.body.preheader: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], 
align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] +; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]] +; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]] +; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10 +; +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8 + store i32* %B, i32** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load i32*, i32** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2 + %6 = load i32, i32* %arrayidx3, align 4 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32*, i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 
%idxprom4 + %9 = load i32, i32* %arrayidx5, align 4 + %add = add nsw i32 %6, %9 + %10 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load i32*, i32** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6 + store i32 %add, i32* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; The same as @foo1 but all the pointers are address space 1 pointers. + +; Function Attrs: nounwind uwtable +define void @foo1_addrspace1(i32 addrspace(1)* %A, i32 addrspace(1)* %B, i32 addrspace(1)* %trigger) { +; AVX1-LABEL: @foo1_addrspace1( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000 +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]] +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]] +; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]] +; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 +; AVX1-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32 addrspace(1)* [[TMP3]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !14 +; AVX1-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP5]], <8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !16, !noalias !18 +; AVX1-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11 +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> 
[[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(1)* [[TMP11]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14 +; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP13]], <8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 +; AVX1-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP17:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP19:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP20:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4 +; AVX1-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]] +; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; +; AVX2-LABEL: @foo1_addrspace1( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000 +; 
AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]] +; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]] +; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 8 +; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 24 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 8 +; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: 
[[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 24 +; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP20]], <8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 8 +; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP21]], <8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16 +; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP22]], <8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 24 +; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP23]], <8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 8 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16 +; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 24 +; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32> 
addrspace(1)* [[TMP39]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 8 +; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16 +; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 24 +; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP52]], <8 x i32> addrspace(1)* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 8 +; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP53]], <8 x i32> addrspace(1)* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16 +; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP54]], <8 x i32> addrspace(1)* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope 
!16, !noalias !18 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 24 +; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP55]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64 +; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 +; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX2: for.body.preheader: +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4 +; AVX2-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] +; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4 +; AVX2-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]] +; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store i32 
[[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4 +; AVX2-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]] +; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20 +; +; AVX512-LABEL: @foo1_addrspace1( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]] +; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]] +; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 32 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP0]], i64 48 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: 
[[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 32 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP12]], i64 48 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP20]], <16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP21]], <16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 
x i1> [[TMP9]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 32 +; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP22]], <16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP24]], i64 48 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP23]], <16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 16 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 32 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP32]], i64 48 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 16 +; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* 
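+; The surrounding checks show the same guarded access pattern as in @foo1, only
+; through addrspace(1): each 16-wide chunk of trigger[] is loaded
+; unconditionally, compared against 100 to form a <16 x i1> mask, and that mask
+; then gates both the llvm.masked.load of B[] and the llvm.masked.store to
+; A[], so lanes that fail the compare never touch memory.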
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 32 +; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP44]], i64 48 +; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP52]], <16 x i32> addrspace(1)* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 16 +; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP53]], <16 x i32> addrspace(1)* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 32 +; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP54]], <16 x i32> addrspace(1)* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP56]], i64 48 +; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP55]], <16 x i32> addrspace(1)* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128 +; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 +; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX512: for.body.preheader: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP65:%.*]] = load 
i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] +; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]] +; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]] +; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store i32 [[ADD_3]], i32 addrspace(1)* 
[[ARRAYIDX7_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20 +; +entry: + %A.addr = alloca i32 addrspace(1)*, align 8 + %B.addr = alloca i32 addrspace(1)*, align 8 + %trigger.addr = alloca i32 addrspace(1)*, align 8 + %i = alloca i32, align 4 + store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 8 + store i32 addrspace(1)* %B, i32 addrspace(1)** %B.addr, align 8 + store i32 addrspace(1)* %trigger, i32 addrspace(1)** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load i32 addrspace(1)*, i32 addrspace(1)** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom2 + %6 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32 addrspace(1)*, i32 addrspace(1)** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %8, i64 %idxprom4 + %9 = load i32, i32 addrspace(1)* %arrayidx5, align 4 + %add = add nsw i32 %6, %9 + %10 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %11, i64 %idxprom6 + store i32 %add, i32 addrspace(1)* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; The source code: +; +;void foo2(float *A, float *B, int *trigger) { +; +; for (int i=0; i<10000; i++) { +; if (trigger[i] < 100) { +; A[i] = B[i] + trigger[i]; +; } +; } +;} + +; Function Attrs: nounwind uwtable +define void @foo2(float* %A, float* %B, i32* %trigger) { +; AVX1-LABEL: @foo2( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float* +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]] +; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]] +; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 
[[BOUND016]], [[BOUND117]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8 +; AVX1-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16 +; AVX1-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24 +; AVX1-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX1-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float> +; AVX1-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float> +; AVX1-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float> +; 
AVX1-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]] +; AVX1-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]] +; AVX1-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]] +; AVX1-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]] +; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8 +; AVX1-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16 +; AVX1-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24 +; AVX1-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX1: for.body.preheader: +; AVX1-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float +; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], 
align 4 +; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float +; AVX1-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]] +; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30 +; +; AVX2-LABEL: @foo2( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000 +; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float* +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]] +; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]] +; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24 +; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 
4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 8 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16 +; AVX2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 24 +; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX2-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float> +; AVX2-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float> +; AVX2-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float> +; AVX2-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]] +; AVX2-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]] +; AVX2-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]] +; AVX2-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]] +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 8 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 24 +; AVX2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX2: for.body.preheader: +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX2-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float +; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4 +; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float +; AVX2-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]] +; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4 +; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float +; AVX2-NEXT: [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]] +; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4 +; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float +; AVX2-NEXT: [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]] +; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store float 
[[ADD_3]], float* [[ARRAYIDX7_3]], align 4 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30 +; +; AVX512-LABEL: @foo2( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000 +; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float* +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]] +; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]] +; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !21 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !21 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 32 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !21 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 48 +; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>* +; 
AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 16 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 32 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 48 +; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* nonnull [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[TMP22:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> +; AVX512-NEXT: [[TMP23:%.*]] = sitofp <16 x i32> [[WIDE_LOAD22]] to <16 x float> +; AVX512-NEXT: [[TMP24:%.*]] = sitofp <16 x i32> [[WIDE_LOAD23]] to <16 x float> +; AVX512-NEXT: [[TMP25:%.*]] = sitofp <16 x i32> [[WIDE_LOAD24]] to <16 x float> +; AVX512-NEXT: [[TMP26:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP22]] +; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]] +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]] +; AVX512-NEXT: [[TMP29:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]] +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP26]], <16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 16 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 32 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <16 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP28]], <16 x float>* [[TMP35]], i32 4, <16 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 48 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <16 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP29]], <16 x float>* [[TMP37]], i32 4, <16 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 +; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX512: for.body.preheader: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, 
[[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float +; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]] +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4 +; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float +; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]] +; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4 +; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float +; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]] +; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 
[[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4 +; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float +; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]] +; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30 +; +entry: + %A.addr = alloca float*, align 8 + %B.addr = alloca float*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store float* %A, float** %A.addr, align 8 + store float* %B, float** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load float*, float** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds float, float* %5, i64 %idxprom2 + %6 = load float, float* %arrayidx3, align 4 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32*, i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4 + %9 = load i32, i32* %arrayidx5, align 4 + %conv = sitofp i32 %9 to float + %add = fadd float %6, %conv + %10 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load float*, float** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds float, float* %11, i64 %idxprom6 + store float %add, float* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; The source code: +; +;void foo3(double *A, double *B, int *trigger) { +; +; for (int i=0; i<10000; i++) { +; if (trigger[i] < 100) { +; A[i] = B[i] + trigger[i]; +; } +; } +;} + +; Function Attrs: nounwind uwtable +define void @foo3(double* %A, double* %B, i32* %trigger) #0 { +; AVX1-LABEL: @foo3( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double* +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt double* 
[[SCEVGEP14]], [[A]] +; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] +; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4 +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34 +; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4 +; AVX1-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34 +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8 +; AVX1-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34 +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12 +; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34 +; AVX1-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX1-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double> +; AVX1-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double> +; AVX1-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x 
double> +; AVX1-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]] +; AVX1-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]] +; AVX1-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]] +; AVX1-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]] +; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 +; AVX1-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4 +; AVX1-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8 +; AVX1-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12 +; AVX1-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double +; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double +; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]] +; 
AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; +; AVX2-LABEL: @foo3( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000 +; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double* +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]] +; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] +; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 12 +; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31 +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 4 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x 
double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8 +; AVX2-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 12 +; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34 +; AVX2-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double> +; AVX2-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double> +; AVX2-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double> +; AVX2-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double> +; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]] +; AVX2-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]] +; AVX2-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]] +; AVX2-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]] +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 4 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 12 +; AVX2-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 +; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 0, [[ENTRY]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: 
[[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double +; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]] +; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double +; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]] +; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double +; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]] +; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; +; AVX512-LABEL: @foo3( +; 
AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000 +; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double* +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]] +; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] +; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !31 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 8 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !31 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 16 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !31 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 24 +; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31 +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 8 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 16 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast 
double* [[TMP18]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[TMP14]], i64 24 +; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double> +; AVX512-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x double> +; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x double> +; AVX512-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x double> +; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP22]] +; AVX512-NEXT: [[TMP27:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]] +; AVX512-NEXT: [[TMP28:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]] +; AVX512-NEXT: [[TMP29:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]] +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP26]], <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 8 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP27]], <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP11]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 16 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP28]], <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP30]], i64 24 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP29]], <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX512: for.body.preheader: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]] +; 
AVX512-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]] +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double +; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]] +; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double +; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]] +; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double +; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]] +; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; 
AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !40 +; +entry: + %A.addr = alloca double*, align 8 + %B.addr = alloca double*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store double* %A, double** %A.addr, align 8 + store double* %B, double** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load double*, double** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2 + %6 = load double, double* %arrayidx3, align 8 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32*, i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4 + %9 = load i32, i32* %arrayidx5, align 4 + %conv = sitofp i32 %9 to double + %add = fadd double %6, %conv + %10 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load double*, double** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6 + store double %add, double* %arrayidx7, align 8 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; The source code: +; +;void foo4(double *A, double *B, int *trigger) { +; +; for (int i=0; i<10000; i += 16) { +; if (trigger[i] < 100) { +; A[i] = B[i*2] + trigger[i]; << non-consecutive access +; } +; } +;} +; As the checks below verify, these strided accesses can only be vectorized with gathers and scatters; the AVX512 run uses llvm.masked.gather/llvm.masked.scatter, while AVX1 and AVX2 leave the loop scalar. + +; Function Attrs: nounwind uwtable +define void @foo4(double* %A, double* %B, i32* %trigger) { +; AVX1-LABEL: @foo4( +; AVX1-NEXT: entry: +; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]] +; AVX1-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16 +; AVX1-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 +; AVX1-NEXT: br 
i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: for.body.1: +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: if.then.1: +; AVX1-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1 +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]] +; AVX1-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double +; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]] +; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 32 +; AVX1-NEXT: br label [[FOR_BODY]] +; +; AVX2-LABEL: @foo4( +; AVX2-NEXT: entry: +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16 +; AVX2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 +; AVX2-NEXT: br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: for.body.1: +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: if.then.1: +; AVX2-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1 +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double +; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]] +; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32 +; AVX2-NEXT: 
[[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP6]], 100 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1 +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]] +; AVX2-NEXT: [[TMP8:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP6]] to double +; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP8]], [[CONV_2]] +; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP9]], 100 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1 +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP10]] +; AVX2-NEXT: [[TMP11:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP9]] to double +; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP11]], [[CONV_3]] +; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64 +; AVX2-NEXT: br label [[FOR_BODY]] +; +; AVX512-LABEL: @foo4( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 9985 +; AVX512-NEXT: [[SCEVGEP12:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 9985 +; AVX512-NEXT: [[SCEVGEP15:%.*]] = getelementptr double, double* [[B:%.*]], i64 19969 +; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP12]] to double* +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND017:%.*]] = icmp ugt double* [[SCEVGEP15]], [[A]] +; AVX512-NEXT: [[BOUND118:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]] +; AVX512-NEXT: [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[VEC_IND_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[ENTRY]] ] +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> 
@llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41 +; AVX512-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP4:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP4]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER20:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP5]], i32 8, <8 x i1> [[TMP3]], <8 x double> undef), !alias.scope !44 +; AVX512-NEXT: [[TMP6:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double> +; AVX512-NEXT: [[TMP7:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20]], [[TMP6]] +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP7]], <8 x double*> [[TMP8]], i32 8, <8 x i1> [[TMP3]]), !alias.scope !46, !noalias !48 +; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41 +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP11:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP11]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER20_1:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP12]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !44 +; AVX512-NEXT: [[TMP13:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_1]] to <8 x double> +; AVX512-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_1]], [[TMP13]] +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP14]], <8 x double*> [[TMP15]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !46, !noalias !48 +; AVX512-NEXT: [[VEC_IND_NEXT_1:%.*]] = add <8 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256> +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT_1]] +; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41 +; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_2]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP18:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT_1]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP18]] +; 
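+; (Editorial note, inferred from the checks around this point: because B is
+; indexed by 2*i -- the "shl nuw nsw ... 1" feeding each gep -- even adjacent
+; iterations touch non-consecutive memory, so the AVX512 vector body uses
+; masked gathers and scatters rather than wide loads and stores. Each gather
+; handles 8 iterations at a stride of 16 in i, so [[VEC_IND]] advances by
+; 8 * 16 = 128 per vector step and 3 * 128 = 384 per unrolled step; after
+; 624 / 24 = 26 unrolled steps the vector loop has covered i = 0, 16, ...,
+; 9968, leaving i = 9984 to the scalar prologue checked below.)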
AVX512-NEXT: [[WIDE_MASKED_GATHER20_2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP19]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !44 +; AVX512-NEXT: [[TMP20:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_2]] to <8 x double> +; AVX512-NEXT: [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_2]], [[TMP20]] +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT_1]] +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP21]], <8 x double*> [[TMP22]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !46, !noalias !48 +; AVX512-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24 +; AVX512-NEXT: [[VEC_IND_NEXT_2]] = add <8 x i64> [[VEC_IND]], <i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384> +; AVX512-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT_2]], 624 +; AVX512-NEXT: br i1 [[TMP23]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX512: for.body.preheader: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP24:%.*]] = sub nuw nsw i64 9999, [[INDVARS_IV_PH]] +; AVX512-NEXT: br label [[FOR_BODY_PROL:%.*]] +; AVX512: for.body.prol: +; AVX512-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_INC_PROL:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 1, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_PROL]] +; AVX512-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX_PROL]], align 4 +; AVX512-NEXT: [[CMP1_PROL:%.*]] = icmp slt i32 [[TMP25]], 100 +; AVX512-NEXT: br i1 [[CMP1_PROL]], label [[IF_THEN_PROL:%.*]], label [[FOR_INC_PROL]] +; AVX512: if.then.prol: +; AVX512-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[INDVARS_IV_PROL]], 1 +; AVX512-NEXT: [[ARRAYIDX3_PROL:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP26]] +; AVX512-NEXT: [[TMP27:%.*]] = load double, double* [[ARRAYIDX3_PROL]], align 8 +; AVX512-NEXT: [[CONV_PROL:%.*]] = sitofp i32 [[TMP25]] to double +; AVX512-NEXT: [[ADD_PROL:%.*]] = fadd double [[TMP27]], [[CONV_PROL]] +; AVX512-NEXT: [[ARRAYIDX7_PROL:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_PROL]] +; AVX512-NEXT: store double [[ADD_PROL]], double* [[ARRAYIDX7_PROL]], align 8 +; AVX512-NEXT: br label [[FOR_INC_PROL]] +; AVX512: for.inc.prol: +; AVX512-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 16 +; AVX512-NEXT: [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1 +; AVX512-NEXT: [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0 +; AVX512-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL]], !llvm.loop !50 +; AVX512: for.body.prol.loopexit: +; AVX512-NEXT: [[DOTMASK:%.*]] = and i64 [[TMP24]], 9984 +; AVX512-NEXT: [[TMP28:%.*]] = icmp eq i64 [[DOTMASK]], 0 +; AVX512-NEXT: br i1 [[TMP28]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL_LOOPEXIT]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100 +; AVX512-NEXT: br i1 
[[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[TMP30:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP30]] +; AVX512-NEXT: [[TMP31:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP31]], [[CONV]] +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP32:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP32]], 100 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[TMP33:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1 +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP33]] +; AVX512-NEXT: [[TMP34:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP32]] to double +; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP34]], [[CONV_1]] +; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], 32 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP35]], 100 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[TMP36:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1 +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP36]] +; AVX512-NEXT: [[TMP37:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP35]] to double +; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP37]], [[CONV_2]] +; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], 48 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP38]], 100 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[TMP39:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1 +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP39]] +; AVX512-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP38]] to double +; AVX512-NEXT: [[ADD_3:%.*]] = 
fadd double [[TMP40]], [[CONV_3]] +; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], 64 +; AVX512-NEXT: [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !52 +; +entry: + %A.addr = alloca double*, align 8 + %B.addr = alloca double*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store double* %A, double** %A.addr, align 8 + store double* %B, double** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %mul = mul nsw i32 %4, 2 + %idxprom2 = sext i32 %mul to i64 + %5 = load double*, double** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2 + %6 = load double, double* %arrayidx3, align 8 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32*, i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4 + %9 = load i32, i32* %arrayidx5, align 4 + %conv = sitofp i32 %9 to double + %add = fadd double %6, %conv + %10 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load double*, double** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds double, double* %11, i64 %idxprom6 + store double %add, double* %arrayidx7, align 8 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 16 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +@a = common global [1 x i32*] zeroinitializer, align 8 +@c = common global i32* null, align 8 + +; The loop here should not be vectorized due to trapping +; constant expression +; Function Attrs: nounwind uwtable +define void @foo5(i32* %A, i32* %B, i32* %trigger) { +; AVX1-LABEL: @foo5( +; AVX1-NEXT: entry: +; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: br label [[FOR_INC]] +; 
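+; (Editorial note: unlike foo4 and foo6, this file gives no C source for foo5.
+; A rough sketch -- an assumption reconstructed from the IR, not the original
+; source -- using the globals @a and @c declared above is:
+;
+;void foo5(int *A, int *B, int *trigger) {
+;
+; for (int i=0; i<10000; i++) {
+; if (trigger[i] < 100) {
+; A[i] = 1 / (&a[1] == c); // constant expression; the sdiv may trap
+; }
+; }
+;}
+;
+; The stored value is the "sdiv (i32 1, i32 zext (i1 icmp eq ...))" constant
+; expression seen in the checks; its divisor can be zero at run time, so the
+; store must not be executed speculatively. All three prefixes therefore
+; check an unrolled scalar loop with no vector.body. The IR body also
+; computes an unused B[i] + trigger[i] add.)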
AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; +; AVX2-LABEL: @foo5( +; AVX2-NEXT: entry: +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds 
i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]] +; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; AVX2-NEXT: [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100 +; AVX2-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]] +; AVX2: if.then.4: +; AVX2-NEXT: [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]] +; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4 +; AVX2-NEXT: br label [[FOR_INC_4]] +; AVX2: for.inc.4: +; AVX2-NEXT: [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5 +; AVX2-NEXT: [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; +; AVX512-LABEL: @foo5( +; AVX512-NEXT: entry: +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; 
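+; (Note: the AVX512 checks for @foo5 likewise show only a scalar loop,
+; unrolled by 5; the possibly-trapping sdiv blocks vectorization regardless
+; of the available vector width.)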
AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX512-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]] +; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; AVX512-NEXT: [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100 +; AVX512-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]] +; AVX512: if.then.4: +; AVX512-NEXT: [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]] +; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4 +; AVX512-NEXT: br label [[FOR_INC_4]] +; AVX512: for.inc.4: +; AVX512-NEXT: [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5 +; AVX512-NEXT: [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000 +; AVX512-NEXT: br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8 + store i32* %B, i32** %B.addr, align 8 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp slt i32 %3, 100 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load i32*, i32** %B.addr, align 8 + %arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2 + %6 = load i32, i32* %arrayidx3, align 4 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load i32*, i32** %trigger.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %8, i64 %idxprom4 + %9 = load i32, i32* %arrayidx5, 
align 4 + %add = add nsw i32 %6, %9 + %10 = load i32, i32* %i, align 4 + %idxprom6 = sext i32 %10 to i64 + %11 = load i32*, i32** %A.addr, align 8 + %arrayidx7 = getelementptr inbounds i32, i32* %11, i64 %idxprom6 + store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 0, i64 1), i32** @c) to i32)), i32* %arrayidx7, align 4 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %12 = load i32, i32* %i, align 4 + %inc = add nsw i32 %12, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Reverse loop +;void foo6(double *in, double *out, unsigned size, int *trigger) { +; +; for (int i=SIZE-1; i>=0; i--) { +; if (trigger[i] > 0) { +; out[i] = in[i] + (double) 0.5; +; } +; } +;} + + +define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) { +; AVX1-LABEL: @foo6( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096 +; AVX1-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096 +; AVX1-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double* +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]] +; AVX1-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] +; AVX1-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3 +; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3 +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3 +; AVX1-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds 
i32, i32* [[TMP2]], i64 -12 +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX1-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX1-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer +; AVX1-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer +; AVX1-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3 +; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3 +; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8 +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3 +; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX1-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12 +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3 +; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX1-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], <double 
5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3 +; AVX1-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4 +; AVX1-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3 +; AVX1-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8 +; AVX1-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3 +; AVX1-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12 +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3 +; AVX1-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX1-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 
+; AVX1-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nsw i64 [[INDVARS_IV]], -2 +; AVX1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0 +; AVX1-NEXT: br i1 [[CMP_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 +; +; AVX2-LABEL: @foo6( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096 +; AVX2-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096 +; AVX2-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096 +; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double* +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]] +; AVX2-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] +; AVX2-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3 +; AVX2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -4 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -3 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -8 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -3 +; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -12 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -3 +; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], 
zeroinitializer +; AVX2-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer +; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer +; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -3 +; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -4 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -3 +; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8 +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -3 +; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX2-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -12 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -3 +; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +; AVX2-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -3 +; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), 
!alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -4 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -3 +; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8 +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -3 +; AVX2-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -12 +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -3 +; AVX2-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX2-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* 
[[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4 +; AVX2-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0 +; AVX2-NEXT: br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 +; +; AVX512-LABEL: @foo6( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096 +; AVX512-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096 +; AVX512-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096 +; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double* +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]] +; AVX512-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] +; AVX512-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -7 +; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; AVX512-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i32, i32* [[TMP2]], i64 -8 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE21:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD20]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -16 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 -7 +; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD22]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -24 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 -7 +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD24]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; AVX512-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer +; AVX512-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i32> [[REVERSE21]], zeroinitializer +; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -7 +; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; AVX512-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -8 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[TMP21]], i64 -7 +; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -16 +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 -7 +; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; AVX512-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP26]], i32 8, <8 x i1> 
[[REVERSE31]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP18]], i64 -24 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP27]], i64 -7 +; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* nonnull [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -7 +; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP30]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -8 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i64 -7 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -16 +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds double, double* [[TMP40]], i64 -7 +; AVX512-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP42]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[TMP34]], i64 -24 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP43]], i64 -7 +; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX512-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 +; 
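+; (Editorial note on the reverse loop: the C comment above uses SIZE, which
+; is 4096 in this file given the 4095 start index. In the AVX512 vector body
+; just checked, the trigger values are loaded from descending addresses
+; (a gep at offset -7) and put back in loop order with a reversing
+; shufflevector; the resulting <8 x i1> masks are reversed back to memory
+; order before the masked loads of in and the masked stores to out. Four
+; 8-lane chunks per iteration give a step of 32, so the vector loop runs
+; 4096 / 32 = 128 times. The scalar for.body checked next is the fallback
+; taken when the runtime alias checks between out, in, and trigger fail;
+; it is unrolled by 4.)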
AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, 
double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4 +; AVX512-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0 +; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !62 +; +entry: + %in.addr = alloca double*, align 8 + %out.addr = alloca double*, align 8 + %size.addr = alloca i32, align 4 + %trigger.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store double* %in, double** %in.addr, align 8 + store double* %out, double** %out.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + store i32* %trigger, i32** %trigger.addr, align 8 + store i32 4095, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp sge i32 %0, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %2 = load i32*, i32** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %cmp1 = icmp sgt i32 %3, 0 + br i1 %cmp1, label %if.then, label %if.end + +if.then: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %idxprom2 = sext i32 %4 to i64 + %5 = load double*, double** %in.addr, align 8 + %arrayidx3 = getelementptr inbounds double, double* %5, i64 %idxprom2 + %6 = load double, double* %arrayidx3, align 8 + %add = fadd double %6, 5.000000e-01 + %7 = load i32, i32* %i, align 4 + %idxprom4 = sext i32 %7 to i64 + %8 = load double*, double** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds double, double* %8, i64 %idxprom4 + store double %add, double* %arrayidx5, align 8 + br label %if.end + +if.end: ; preds = %if.then, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %9 = load i32, i32* %i, align 4 + %dec = add nsw i32 %9, -1 + store i32 %dec, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; void foo7 (double * __restrict__ out, double ** __restrict__ in, +; bool * __restrict__ trigger, unsigned size) { +; +; for (unsigned i=0; i<size; i++) +; if (trigger[i] && (in[i] != 0)) +; out[i] = (double) 0.5; +; } + +define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 { +; AVX-LABEL: @foo7( +; AVX-NEXT: entry: +; AVX-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; AVX-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; AVX: for.body.preheader: +; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 16 +; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] +; AVX: vector.ph: +; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x 
i8>, <4 x i8>* [[TMP3]], align 1 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8 +; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 +; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> +; AVX-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> +; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> +; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x double*> undef) +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 4 +; AVX-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x double*> undef) +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 +; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x double*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 12 +; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x double*> undef) +; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] +; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] +; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] +; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] +; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 +; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX-NEXT: 
[[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 +; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) +; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX: middle.block: +; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] +; AVX: for.body.preheader16: +; AVX-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; AVX: land.lhs.true: +; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null +; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; AVX: if.then: +; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX-NEXT: br label [[FOR_INC]] +; AVX: for.inc: +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !52 +; AVX: for.end: +; AVX-NEXT: ret void +; +; AVX512-LABEL: @foo7( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; AVX512: for.body.preheader: +; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 32 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] +; AVX512: vector.ph: +; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264 +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr 
inbounds i8, i8* [[TMP0]], i64 8 +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 +; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> +; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> +; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> +; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast double** [[TMP12]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x double*> undef) +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 8 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast double** [[TMP14]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x double*> undef) +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x double*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double*, double** [[TMP12]], i64 24 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x double*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] +; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] +; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] +; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast 
double* [[TMP30]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) +; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !63 +; AVX512: middle.block: +; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] +; AVX512: for.body.preheader16: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; AVX512: land.lhs.true: +; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP39:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP39]], null +; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !64 +; AVX512: for.end: +; AVX512-NEXT: ret void +; +entry: + %out.addr = alloca double*, align 8 + %in.addr = alloca double**, align 8 + %trigger.addr = alloca i8*, align 8 + %size.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store double* %out, double** %out.addr, align 8 + store double** %in, double*** %in.addr, align 8 + store i8* %trigger, 
i8** %trigger.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %size.addr, align 4 + %cmp = icmp ult i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = zext i32 %2 to i64 + %3 = load i8*, i8** %trigger.addr, align 8 + %arrayidx = getelementptr inbounds i8, i8* %3, i64 %idxprom + %4 = load i8, i8* %arrayidx, align 1 + %tobool = trunc i8 %4 to i1 + br i1 %tobool, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %idxprom1 = zext i32 %5 to i64 + %6 = load double**, double*** %in.addr, align 8 + %arrayidx2 = getelementptr inbounds double*, double** %6, i64 %idxprom1 + %7 = load double*, double** %arrayidx2, align 8 + %cmp3 = icmp ne double* %7, null + br i1 %cmp3, label %if.then, label %if.end + +if.then: ; preds = %land.lhs.true + %8 = load i32, i32* %i, align 4 + %idxprom4 = zext i32 %8 to i64 + %9 = load double*, double** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds double, double* %9, i64 %idxprom4 + store double 5.000000e-01, double* %arrayidx5, align 8 + br label %if.end + +if.end: ; preds = %if.then, %land.lhs.true, %for.body + br label %for.inc + +for.inc: ; preds = %if.end + %10 = load i32, i32* %i, align 4 + %inc = add i32 %10, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +;typedef int (*fp)(); +;void foo8 (double* __restrict__ out, fp* __restrict__ in, bool * __restrict__ trigger, unsigned size) { +; +; for (unsigned i=0; i<size; i++) +; if (trigger[i] && (in[i] != 0)) +; out[i] = (double) 0.5; +;} + +define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 { +; AVX-LABEL: @foo8( +; AVX-NEXT: entry: +; AVX-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; AVX-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; AVX: for.body.preheader: +; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 16 +; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] +; AVX: vector.ph: +; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8 +; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 +; AVX-NEXT: [[TMP8:%.*]] = trunc <4 x i8> [[WIDE_LOAD]] to <4 x i1> +; AVX-NEXT: 
[[TMP9:%.*]] = trunc <4 x i8> [[WIDE_LOAD10]] to <4 x i1> +; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i8> [[WIDE_LOAD11]] to <4 x i1> +; AVX-NEXT: [[TMP11:%.*]] = trunc <4 x i8> [[WIDE_LOAD12]] to <4 x i1> +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP13]], i32 8, <4 x i1> [[TMP8]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 4 +; AVX-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP15]], i32 8, <4 x i1> [[TMP9]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 +; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP17]], i32 8, <4 x i1> [[TMP10]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 12 +; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* nonnull [[TMP19]], i32 8, <4 x i1> [[TMP11]], <4 x i32 ()*> undef) +; AVX-NEXT: [[TMP20:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP22:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP23:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP25:%.*]] = and <4 x i1> [[TMP20]], [[TMP8]] +; AVX-NEXT: [[TMP26:%.*]] = and <4 x i1> [[TMP21]], [[TMP9]] +; AVX-NEXT: [[TMP27:%.*]] = and <4 x i1> [[TMP22]], [[TMP10]] +; AVX-NEXT: [[TMP28:%.*]] = and <4 x i1> [[TMP23]], [[TMP11]] +; AVX-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP29]], i32 8, <4 x i1> [[TMP25]]) +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 4 +; AVX-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP26]]) +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP27]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 12 +; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> <double 5.000000e-01, double 
5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP28]]) +; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX: middle.block: +; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] +; AVX: for.body.preheader16: +; AVX-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; AVX: land.lhs.true: +; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null +; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; AVX: if.then: +; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX-NEXT: br label [[FOR_INC]] +; AVX: for.inc: +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !55 +; AVX: for.end: +; AVX-NEXT: ret void +; +; AVX512-LABEL: @foo8( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; AVX512: for.body.preheader: +; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 32 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] +; AVX512: vector.ph: +; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264 +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8 +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast 
i8* [[TMP6]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 +; AVX512-NEXT: [[TMP8:%.*]] = trunc <8 x i8> [[WIDE_LOAD]] to <8 x i1> +; AVX512-NEXT: [[TMP9:%.*]] = trunc <8 x i8> [[WIDE_LOAD10]] to <8 x i1> +; AVX512-NEXT: [[TMP10:%.*]] = trunc <8 x i8> [[WIDE_LOAD11]] to <8 x i1> +; AVX512-NEXT: [[TMP11:%.*]] = trunc <8 x i8> [[WIDE_LOAD12]] to <8 x i1> +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 ()** [[TMP12]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP13]], i32 8, <8 x i1> [[TMP8]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 8 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 ()** [[TMP14]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP15]], i32 8, <8 x i1> [[TMP9]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP17]], i32 8, <8 x i1> [[TMP10]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[TMP12]], i64 24 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* nonnull [[TMP19]], i32 8, <8 x i1> [[TMP11]], <8 x i32 ()*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = and <8 x i1> [[TMP20]], [[TMP8]] +; AVX512-NEXT: [[TMP26:%.*]] = and <8 x i1> [[TMP21]], [[TMP9]] +; AVX512-NEXT: [[TMP27:%.*]] = and <8 x i1> [[TMP22]], [[TMP10]] +; AVX512-NEXT: [[TMP28:%.*]] = and <8 x i1> [[TMP23]], [[TMP11]] +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP24]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP25]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 8 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* +; 
AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP27]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i64 24 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01>, <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP28]]) +; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66 +; AVX512: middle.block: +; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] +; AVX512: for.body.preheader16: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP37:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX512-NEXT: [[TMP38:%.*]] = and i8 [[TMP37]], 1 +; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP38]], 0 +; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; AVX512: land.lhs.true: +; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP39:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8 +; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP39]], null +; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !67 +; AVX512: for.end: +; AVX512-NEXT: ret void +; +entry: + %out.addr = alloca double*, align 8 + %in.addr = alloca i32 ()**, align 8 + %trigger.addr = alloca i8*, align 8 + %size.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store double* %out, double** %out.addr, align 8 + store i32 ()** %in, i32 ()*** %in.addr, align 8 + store i8* %trigger, i8** %trigger.addr, align 8 + store i32 %size, i32* %size.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %size.addr, align 4 + %cmp = icmp ult i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %idxprom = zext i32 %2 to i64 + %3 = load i8*, i8** %trigger.addr, align 8 + %arrayidx = getelementptr 
inbounds i8, i8* %3, i64 %idxprom
+ %4 = load i8, i8* %arrayidx, align 1
+ %tobool = trunc i8 %4 to i1
+ br i1 %tobool, label %land.lhs.true, label %if.end
+
+land.lhs.true: ; preds = %for.body
+ %5 = load i32, i32* %i, align 4
+ %idxprom1 = zext i32 %5 to i64
+ %6 = load i32 ()**, i32 ()*** %in.addr, align 8
+ %arrayidx2 = getelementptr inbounds i32 ()*, i32 ()** %6, i64 %idxprom1
+ %7 = load i32 ()*, i32 ()** %arrayidx2, align 8
+ %cmp3 = icmp ne i32 ()* %7, null
+ br i1 %cmp3, label %if.then, label %if.end
+
+if.then: ; preds = %land.lhs.true
+ %8 = load i32, i32* %i, align 4
+ %idxprom4 = zext i32 %8 to i64
+ %9 = load double*, double** %out.addr, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %9, i64 %idxprom4
+ store double 5.000000e-01, double* %arrayidx5, align 8
+ br label %if.end
+
+if.end: ; preds = %if.then, %land.lhs.true, %for.body
+ br label %for.inc
+
+for.inc: ; preds = %if.end
+ %10 = load i32, i32* %i, align 4
+ %inc = add i32 %10, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/max-mstore.ll b/llvm/test/Transforms/LoopVectorize/X86/max-mstore.ll
new file mode 100644
index 00000000000..a9ac04d4560
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/max-mstore.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -S -mcpu=core-avx2 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [256 x i32] zeroinitializer, align 16
+@a = common global [256 x i32] zeroinitializer, align 16
+
+; unsigned int a[256], b[256];
+; void foo() {
+; for (unsigned i = 0; i < 256; i++) {
+; if (b[i] > a[i])
+; a[i] = b[i];
+; }
+; }
+
+; CHECK-LABEL: foo
+; CHECK: load <8 x i32>
+; CHECK: icmp ugt <8 x i32>
+; CHECK: masked.store
+
+define void @foo() {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* @b, i64 0, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [256 x i32], [256 x i32]* @a, i64 0, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %cmp3 = icmp ugt i32 %0, %1
+ br i1 %cmp3, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ store i32 %0, i32* %arrayidx2, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
new file mode 100644
index 00000000000..709e69fbb1d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -0,0 +1,2473 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-threshold=150 -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DEFAULT
+; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -unroll-threshold=150 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
+
+; This file tests the llvm.loop.vectorize.enable metadata forcing
+; vectorization even when optimization levels are too low, or when
+; vectorization is disabled.
+
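+; For orientation, a minimal sketch of the metadata mechanism this test
+; relies on (the !0/!1 ids here are illustrative, not the ids used by the
+; checks below): tagging a loop's back-edge branch as shown is what asks
+; the vectorizer to run even at -O1/-Oz or with vectorization disabled.
+;
+;   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+;   ...
+;   !0 = !{!0, !1}
+;   !1 = !{!"llvm.loop.vectorize.enable", i1 true}
+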
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+; O1-LABEL: @enabled(
+; O1-NEXT: entry:
+; O1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]],
align 4 +; O1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; O1-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 +; O1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; O1-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 +; O1-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; O1-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 +; O1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; O1-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; O1-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; O1-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 +; O1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 +; O1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; O1-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; O1-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 +; O1-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; O1-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 +; O1-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; O1-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 +; O1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; O1-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 +; O1-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; O1-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 +; O1-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; O1-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 +; O1-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; O1-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 +; O1-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; O1-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_11:%.*]] 
= load <4 x i32>, <4 x i32>* [[TMP54]], align 4 +; O1-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; O1-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; O1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; O1-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; O1-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; O1-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 +; O1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; O1-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; O1-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; O1-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; O1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; O1-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; O1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; O1-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; O1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 +; O1-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; O1-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; O1-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; O1-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; O1-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; O1-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; O1-NEXT: ret i32 [[TMP78]] +; +; O2-LABEL: @enabled( +; O2-NEXT: entry: +; O2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* 
+; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O2-NEXT: ret i32 [[TMP78]]
+;
+; O3-LABEL: @enabled(
+; O3-NEXT: entry:
+; O3-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3-NEXT: ret i32 [[TMP78]]
+;
+; O3DEFAULT-LABEL: @enabled(
+; O3DEFAULT-NEXT: entry:
+; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O3DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O3DEFAULT-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O3DEFAULT-NEXT: ret i32 [[TMP78]]
+;
+; Os-LABEL: @enabled(
+; Os-NEXT: entry:
+; Os-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; Os-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; Os-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; Os-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; Os-NEXT: ret i32 [[TMP78]]
+;
+; Oz-LABEL: @enabled(
+; Oz-NEXT: entry:
+; Oz-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; Oz-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; Oz-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; Oz-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; Oz-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; Oz-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; Oz-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; Oz-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; Oz-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; Oz-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; Oz-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; Oz-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; Oz-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; Oz-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; Oz-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; Oz-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; Oz-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; Oz-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; Oz-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; Oz-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; Oz-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; Oz-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; Oz-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; Oz-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; Oz-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; Oz-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; Oz-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; Oz-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; Oz-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; Oz-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; Oz-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; Oz-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; Oz-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; Oz-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; Oz-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; Oz-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; Oz-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; Oz-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; Oz-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; Oz-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; Oz-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; Oz-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; Oz-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; Oz-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; Oz-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; Oz-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; Oz-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; Oz-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; Oz-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; Oz-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; Oz-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; Oz-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; Oz-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; Oz-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; Oz-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; Oz-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; Oz-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; Oz-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; Oz-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; Oz-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; Oz-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; Oz-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; Oz-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; Oz-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; Oz-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; Oz-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; Oz-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; Oz-NEXT: ret i32 [[TMP78]]
+;
+; O1VEC-LABEL: @enabled(
+; O1VEC-NEXT: entry:
+; O1VEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1VEC-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1VEC-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1VEC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1VEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1VEC-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1VEC-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1VEC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1VEC-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1VEC-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1VEC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1VEC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1VEC-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1VEC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1VEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1VEC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1VEC-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1VEC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1VEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1VEC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1VEC-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1VEC-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1VEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1VEC-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1VEC-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1VEC-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1VEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1VEC-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1VEC-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1VEC-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1VEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1VEC-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1VEC-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1VEC-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1VEC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1VEC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1VEC-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1VEC-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1VEC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1VEC-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1VEC-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1VEC-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1VEC-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1VEC-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1VEC-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1VEC-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1VEC-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1VEC-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1VEC-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1VEC-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1VEC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1VEC-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1VEC-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1VEC-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1VEC-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1VEC-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1VEC-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1VEC-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1VEC-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1VEC-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1VEC-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1VEC-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC-NEXT: ret i32 [[TMP78]]
+;
+; OzVEC-LABEL: @enabled(
+; OzVEC-NEXT: entry:
+; OzVEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; OzVEC-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; OzVEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; OzVEC-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; OzVEC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; OzVEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; OzVEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; OzVEC-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; OzVEC-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; OzVEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; OzVEC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; OzVEC-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; OzVEC-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; OzVEC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; OzVEC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; OzVEC-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; OzVEC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; OzVEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; OzVEC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; OzVEC-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; OzVEC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; OzVEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; OzVEC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; OzVEC-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; OzVEC-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; OzVEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; OzVEC-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; OzVEC-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; OzVEC-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; OzVEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; OzVEC-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; OzVEC-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; OzVEC-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; OzVEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; OzVEC-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; OzVEC-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; OzVEC-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; OzVEC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; OzVEC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; OzVEC-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; OzVEC-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; OzVEC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; OzVEC-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; OzVEC-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; OzVEC-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; OzVEC-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; OzVEC-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; OzVEC-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; OzVEC-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; OzVEC-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; OzVEC-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; OzVEC-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; OzVEC-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; OzVEC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; OzVEC-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; OzVEC-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; OzVEC-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; OzVEC-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; OzVEC-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; OzVEC-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; OzVEC-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; OzVEC-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; OzVEC-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; OzVEC-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; OzVEC-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; OzVEC-NEXT: ret i32 [[TMP78]]
+;
+; O1VEC2-LABEL: @enabled(
+; O1VEC2-NEXT: entry:
+; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; O1VEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; O1VEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; O1VEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; O1VEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; O1VEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; O1VEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; O1VEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; O1VEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; O1VEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; O1VEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; O1VEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; O1VEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; O1VEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; O1VEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; O1VEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; O1VEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; O1VEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; O1VEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; O1VEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; O1VEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; O1VEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; O1VEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; O1VEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; O1VEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; O1VEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; O1VEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; O1VEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; O1VEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; O1VEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; O1VEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; O1VEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; O1VEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; O1VEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; O1VEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; O1VEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; O1VEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; O1VEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; O1VEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; O1VEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; O1VEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; O1VEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; O1VEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; O1VEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
+; O1VEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4
+; O1VEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48
+; O1VEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4
+; O1VEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48
+; O1VEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4
+; O1VEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52
+; O1VEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4
+; O1VEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52
+; O1VEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4
+; O1VEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56
+; O1VEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4
+; O1VEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56
+; O1VEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4
+; O1VEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60
+; O1VEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>*
+; O1VEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4
+; O1VEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]]
+; O1VEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60
+; O1VEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; O1VEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4
+; O1VEC2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4
+; O1VEC2-NEXT: ret i32 [[TMP78]]
+;
+; OzVEC2-LABEL: @enabled(
+; OzVEC2-NEXT: entry:
+; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; OzVEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
+; OzVEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; OzVEC2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
+; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
+; OzVEC2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4
+; OzVEC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
+; OzVEC2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; OzVEC2-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
+; OzVEC2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
+; OzVEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
+; OzVEC2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
+; OzVEC2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; OzVEC2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
+; OzVEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
+; OzVEC2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4
+; OzVEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
+; OzVEC2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
+; OzVEC2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
+; OzVEC2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; OzVEC2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
+; OzVEC2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4
+; OzVEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
+; OzVEC2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
+; OzVEC2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
+; OzVEC2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4
+; OzVEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
+; OzVEC2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
+; OzVEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
+; OzVEC2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4
+; OzVEC2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
+; OzVEC2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4
+; OzVEC2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
+; OzVEC2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4
+; OzVEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
+; OzVEC2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4
+; OzVEC2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
+; OzVEC2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4
+; OzVEC2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
+; OzVEC2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4
+; OzVEC2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
+; OzVEC2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>*
+; OzVEC2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4
+; OzVEC2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
+; OzVEC2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>*
+; OzVEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4
+; OzVEC2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]]
+; OzVEC2-NEXT: [[TMP56:%.*]]
= getelementptr inbounds i32, i32* [[A]], i64 44 +; OzVEC2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; OzVEC2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; OzVEC2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; OzVEC2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; OzVEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; OzVEC2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; OzVEC2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; OzVEC2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 +; OzVEC2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; OzVEC2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; OzVEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; OzVEC2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; OzVEC2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; OzVEC2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; OzVEC2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; OzVEC2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; OzVEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; OzVEC2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; OzVEC2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; OzVEC2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; OzVEC2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 +; OzVEC2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; OzVEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; OzVEC2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; OzVEC2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; OzVEC2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; OzVEC2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; OzVEC2-NEXT: ret i32 [[TMP78]] +; +; O3DIS-LABEL: @enabled( +; O3DIS-NEXT: entry: +; O3DIS-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3DIS-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3DIS-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; O3DIS-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; O3DIS-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O3DIS-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; O3DIS-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; O3DIS-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] 
to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 +; O3DIS-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; O3DIS-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; O3DIS-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 +; O3DIS-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 +; O3DIS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 +; O3DIS-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 +; O3DIS-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 +; O3DIS-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 +; O3DIS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; O3DIS-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; O3DIS-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; O3DIS-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 +; O3DIS-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; O3DIS-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 +; O3DIS-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; O3DIS-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 +; O3DIS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; O3DIS-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; O3DIS-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; O3DIS-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 +; O3DIS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 +; O3DIS-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; O3DIS-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; O3DIS-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 +; O3DIS-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; O3DIS-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 +; O3DIS-NEXT: [[TMP40:%.*]] = add nsw <4 x 
i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; O3DIS-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 +; O3DIS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; O3DIS-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 +; O3DIS-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; O3DIS-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 +; O3DIS-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; O3DIS-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 +; O3DIS-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; O3DIS-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 +; O3DIS-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; O3DIS-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 +; O3DIS-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; O3DIS-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; O3DIS-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; O3DIS-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; O3DIS-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; O3DIS-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 +; O3DIS-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; O3DIS-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; O3DIS-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; O3DIS-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; O3DIS-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; O3DIS-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; O3DIS-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; O3DIS-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; O3DIS-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 
+; O3DIS-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; O3DIS-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; O3DIS-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O3DIS-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; O3DIS-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; O3DIS-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; O3DIS-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; O3DIS-NEXT: ret i32 [[TMP78]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 64 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 + +for.end: ; preds = %for.body + %1 = load i32, i32* %a, align 4 + ret i32 %1 +} + +define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +; O1-LABEL: @nopragma( +; O1-NEXT: entry: +; O1-NEXT: br label [[FOR_BODY:%.*]] +; O1: for.body: +; O1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; O1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; O1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; O1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; O1: for.end: +; O1-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O1-NEXT: ret i32 [[TMP1]] +; +; O2-LABEL: @nopragma( +; O2-NEXT: entry: +; O2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; O2-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; O2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 +; O2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; O2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; O2-NEXT: [[TMP10:%.*]] = add nsw 
<4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 +; O2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 +; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 +; O2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 +; O2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 +; O2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 +; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; O2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; O2-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 +; O2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; O2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 +; O2-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; O2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 +; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; O2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; O2-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; O2-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 +; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 +; O2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; O2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 +; O2-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; O2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 +; O2-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; O2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 +; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; O2-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 
+; O2-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; O2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 +; O2-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; O2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 +; O2-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; O2-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 +; O2-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; O2-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 +; O2-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; O2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; O2-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; O2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; O2-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; O2-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 +; O2-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; O2-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; O2-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; O2-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; O2-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; O2-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; O2-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; O2-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; O2-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 +; O2-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; O2-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O2-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; O2-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; O2-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; O2-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; O2-NEXT: ret i32 [[TMP78]] +; +; O3-LABEL: @nopragma( +; O3-NEXT: entry: +; O3-NEXT: 
[[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; O3-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O3-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; O3-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 +; O3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; O3-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; O3-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 +; O3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 +; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 +; O3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 +; O3-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 +; O3-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 +; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; O3-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; O3-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 +; O3-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; O3-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 +; O3-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; O3-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 +; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; O3-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; O3-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; O3-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 
x i32>* +; O3-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 +; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 +; O3-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; O3-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 +; O3-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; O3-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 +; O3-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; O3-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 +; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; O3-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 +; O3-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; O3-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 +; O3-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; O3-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 +; O3-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; O3-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 +; O3-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; O3-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 +; O3-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; O3-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; O3-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; O3-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; O3-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; O3-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 +; O3-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; O3-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; O3-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; O3-NEXT: 
[[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; O3-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; O3-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; O3-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; O3-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; O3-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 +; O3-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; O3-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O3-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; O3-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; O3-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; O3-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; O3-NEXT: ret i32 [[TMP78]] +; +; O3DEFAULT-LABEL: @nopragma( +; O3DEFAULT-NEXT: entry: +; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; O3DEFAULT-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O3DEFAULT-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; O3DEFAULT-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 +; O3DEFAULT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; O3DEFAULT-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 +; O3DEFAULT-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 +; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 +; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 +; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 +; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x 
i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 +; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; O3DEFAULT-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 +; O3DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; O3DEFAULT-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 +; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; O3DEFAULT-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 +; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; O3DEFAULT-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; O3DEFAULT-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 +; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 +; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 +; O3DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; O3DEFAULT-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 +; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 +; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; O3DEFAULT-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 +; O3DEFAULT-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; O3DEFAULT-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 +; O3DEFAULT-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* 
[[TMP48]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 +; O3DEFAULT-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; O3DEFAULT-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 +; O3DEFAULT-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; O3DEFAULT-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 +; O3DEFAULT-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; O3DEFAULT-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; O3DEFAULT-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; O3DEFAULT-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; O3DEFAULT-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; O3DEFAULT-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 +; O3DEFAULT-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; O3DEFAULT-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; O3DEFAULT-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; O3DEFAULT-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; O3DEFAULT-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; O3DEFAULT-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; O3DEFAULT-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; O3DEFAULT-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; O3DEFAULT-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 +; O3DEFAULT-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; O3DEFAULT-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O3DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; O3DEFAULT-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; O3DEFAULT-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; O3DEFAULT-NEXT: ret i32 [[TMP78]] +; +; Os-LABEL: @nopragma( +; Os-NEXT: entry: +; Os-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; Os-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 
x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; Os-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; Os-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; Os-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 +; Os-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; Os-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; Os-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 +; Os-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 +; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 +; Os-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 +; Os-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 +; Os-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 +; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; Os-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; Os-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 +; Os-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; Os-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 +; Os-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; Os-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 +; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; Os-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; Os-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; Os-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 +; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* 
[[B]], i64 28 +; Os-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; Os-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 +; Os-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; Os-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 +; Os-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; Os-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 +; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; Os-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 +; Os-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; Os-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 +; Os-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; Os-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 +; Os-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; Os-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 +; Os-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; Os-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 +; Os-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; Os-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; Os-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; Os-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; Os-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; Os-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 +; Os-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; Os-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; Os-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; Os-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; Os-NEXT: 
[[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; Os-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; Os-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; Os-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; Os-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 +; Os-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; Os-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; Os-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; Os-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; Os-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; Os-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; Os-NEXT: ret i32 [[TMP78]] +; +; Oz-LABEL: @nopragma( +; Oz-NEXT: entry: +; Oz-NEXT: br label [[FOR_BODY:%.*]] +; Oz: for.body: +; Oz-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; Oz-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; Oz-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; Oz-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; Oz-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; Oz-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; Oz-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; Oz-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; Oz-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; Oz: for.end: +; Oz-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; Oz-NEXT: ret i32 [[TMP1]] +; +; O1VEC-LABEL: @nopragma( +; O1VEC-NEXT: entry: +; O1VEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O1VEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O1VEC-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; O1VEC-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; O1VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O1VEC-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; O1VEC-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; O1VEC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 +; O1VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; O1VEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; O1VEC-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], 
i64 8 +; O1VEC-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 +; O1VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 +; O1VEC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 +; O1VEC-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 +; O1VEC-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 +; O1VEC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; O1VEC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; O1VEC-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; O1VEC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 +; O1VEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; O1VEC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 +; O1VEC-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; O1VEC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 +; O1VEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; O1VEC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; O1VEC-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; O1VEC-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 +; O1VEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 +; O1VEC-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; O1VEC-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; O1VEC-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 +; O1VEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; O1VEC-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 +; O1VEC-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; O1VEC-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 +; O1VEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; O1VEC-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x 
i32>* [[TMP44]], align 4 +; O1VEC-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; O1VEC-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 +; O1VEC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; O1VEC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 +; O1VEC-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; O1VEC-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 +; O1VEC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; O1VEC-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 +; O1VEC-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; O1VEC-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; O1VEC-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; O1VEC-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; O1VEC-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; O1VEC-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP60]], <4 x i32>* [[TMP62]], align 4 +; O1VEC-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; O1VEC-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; O1VEC-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; O1VEC-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; O1VEC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; O1VEC-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; O1VEC-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; O1VEC-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; O1VEC-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 +; O1VEC-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; O1VEC-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; O1VEC-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; O1VEC-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; O1VEC-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; O1VEC-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; 
O1VEC-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; O1VEC-NEXT: ret i32 [[TMP78]] +; +; OzVEC-LABEL: @nopragma( +; OzVEC-NEXT: entry: +; OzVEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; OzVEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; OzVEC-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; OzVEC-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 +; OzVEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; OzVEC-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 +; OzVEC-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; OzVEC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 4 +; OzVEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; OzVEC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; OzVEC-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 +; OzVEC-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4 +; OzVEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 +; OzVEC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 +; OzVEC-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 +; OzVEC-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4 +; OzVEC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; OzVEC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP19]], align 4 +; OzVEC-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; OzVEC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4 +; OzVEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; OzVEC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 +; OzVEC-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; OzVEC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* [[TMP27]], align 4 +; OzVEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; OzVEC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>* +; 
OzVEC-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4 +; OzVEC-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; OzVEC-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP30]], <4 x i32>* [[TMP32]], align 4 +; OzVEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 +; OzVEC-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; OzVEC-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; OzVEC-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP37]], align 4 +; OzVEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; OzVEC-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP39]], align 4 +; OzVEC-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; OzVEC-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP42]], align 4 +; OzVEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; OzVEC-NEXT: [[TMP44:%.*]] = bitcast i32* [[TMP43]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP44]], align 4 +; OzVEC-NEXT: [[TMP45:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; OzVEC-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP47]], align 4 +; OzVEC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; OzVEC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP49]], align 4 +; OzVEC-NEXT: [[TMP50:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; OzVEC-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP50]], <4 x i32>* [[TMP52]], align 4 +; OzVEC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; OzVEC-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP54]], align 4 +; OzVEC-NEXT: [[TMP55:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; OzVEC-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP55]], <4 x i32>* [[TMP57]], align 4 +; OzVEC-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 48 +; OzVEC-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP59]], align 4 +; OzVEC-NEXT: [[TMP60:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 48 +; OzVEC-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> 
[[TMP60]], <4 x i32>* [[TMP62]], align 4 +; OzVEC-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 52 +; OzVEC-NEXT: [[TMP64:%.*]] = bitcast i32* [[TMP63]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP64]], align 4 +; OzVEC-NEXT: [[TMP65:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 52 +; OzVEC-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP65]], <4 x i32>* [[TMP67]], align 4 +; OzVEC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 56 +; OzVEC-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, <4 x i32>* [[TMP69]], align 4 +; OzVEC-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 56 +; OzVEC-NEXT: [[TMP72:%.*]] = bitcast i32* [[TMP71]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP72]], align 4 +; OzVEC-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 60 +; OzVEC-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <4 x i32>* +; OzVEC-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP74]], align 4 +; OzVEC-NEXT: [[TMP75:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT2]] +; OzVEC-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 60 +; OzVEC-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>* +; OzVEC-NEXT: store <4 x i32> [[TMP75]], <4 x i32>* [[TMP77]], align 4 +; OzVEC-NEXT: [[TMP78:%.*]] = load i32, i32* [[A]], align 4 +; OzVEC-NEXT: ret i32 [[TMP78]] +; +; O1VEC2-LABEL: @nopragma( +; O1VEC2-NEXT: entry: +; O1VEC2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; O1VEC2: vector.ph: +; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O1VEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; O1VEC2-NEXT: br label [[VECTOR_BODY:%.*]] +; O1VEC2: vector.body: +; O1VEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 +; O1VEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; O1VEC2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3> +; O1VEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; O1VEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; O1VEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; O1VEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; O1VEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; O1VEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; O1VEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4 +; O1VEC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; O1VEC2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; O1VEC2-NEXT: br i1 [[TMP8]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; O1VEC2: middle.block: +; O1VEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64 +; O1VEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; O1VEC2: scalar.ph: +; O1VEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; O1VEC2-NEXT: br label [[FOR_BODY:%.*]] +; O1VEC2: for.body: +; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; O1VEC2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]] +; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; O1VEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2 +; O1VEC2: for.end: +; O1VEC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A]], align 4 +; O1VEC2-NEXT: ret i32 [[TMP10]] +; +; OzVEC2-LABEL: @nopragma( +; OzVEC2-NEXT: entry: +; OzVEC2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; OzVEC2: vector.ph: +; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; OzVEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer +; OzVEC2-NEXT: br label [[VECTOR_BODY:%.*]] +; OzVEC2: vector.body: +; OzVEC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 +; OzVEC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; OzVEC2-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3> +; OzVEC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; OzVEC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; OzVEC2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; OzVEC2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]] +; OzVEC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; OzVEC2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; OzVEC2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4 +; OzVEC2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; OzVEC2-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64 +; OzVEC2-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; OzVEC2: middle.block: +; OzVEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64 +; OzVEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; OzVEC2: scalar.ph: +; OzVEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; OzVEC2-NEXT: br label [[FOR_BODY:%.*]] +; OzVEC2: for.body: +; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; OzVEC2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[N]] +; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; OzVEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2 +; OzVEC2: for.end: +; OzVEC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[A]], align 4 +; OzVEC2-NEXT: ret i32 [[TMP10]] +; +; O3DIS-LABEL: @nopragma( +; O3DIS-NEXT: entry: +; O3DIS-NEXT: br label [[FOR_BODY:%.*]] +; O3DIS: for.body: +; O3DIS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O3DIS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; O3DIS-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O3DIS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; O3DIS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; O3DIS-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O3DIS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O3DIS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 64 +; O3DIS-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; O3DIS: for.end: +; O3DIS-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O3DIS-NEXT: ret i32 [[TMP1]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 64 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + %1 = load i32, i32* %a, align 4 + ret i32 %1 +} + +define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { +; O1-LABEL: @disabled( +; O1-NEXT: entry: +; O1-NEXT: br label [[FOR_BODY:%.*]] +; O1: for.body: +; O1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; O1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; O1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; O1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 +; O1: for.end: +; O1-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O1-NEXT: ret i32 [[TMP1]] +; +; O2-LABEL: @disabled( +; O2-NEXT: entry: +; O2-NEXT: br label [[FOR_BODY:%.*]] +; O2: for.body: +; O2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O2-NEXT: [[ARRAYIDX:%.*]] 
= getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; O2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; O2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; O2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; O2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 +; O2: for.end: +; O2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O2-NEXT: ret i32 [[TMP1]] +; +; O3-LABEL: @disabled( +; O3-NEXT: entry: +; O3-NEXT: br label [[FOR_BODY:%.*]] +; O3: for.body: +; O3-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; O3-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; O3-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; O3-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O3-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O3-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; O3-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 +; O3: for.end: +; O3-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O3-NEXT: ret i32 [[TMP1]] +; +; O3DEFAULT-LABEL: @disabled( +; O3DEFAULT-NEXT: entry: +; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; O3DEFAULT-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; O3DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer +; O3DEFAULT-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP1]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4 +; O3DEFAULT-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 +; O3DEFAULT-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX_4]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 +; O3DEFAULT-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX2_4]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8 +; O3DEFAULT-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8 +; O3DEFAULT-NEXT: [[TMP10:%.*]] = bitcast i32* [[ARRAYIDX_8]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4 +; O3DEFAULT-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX2_8]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12 +; O3DEFAULT-NEXT: [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 +; O3DEFAULT-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX_12]] to <4 x i32>* 
+; O3DEFAULT-NEXT: [[TMP15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4 +; O3DEFAULT-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX2_12]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16 +; O3DEFAULT-NEXT: [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16 +; O3DEFAULT-NEXT: [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX_16]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4 +; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[TMP19]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX2_16]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20 +; O3DEFAULT-NEXT: [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20 +; O3DEFAULT-NEXT: [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX_20]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4 +; O3DEFAULT-NEXT: [[TMP24:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX2_20]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24 +; O3DEFAULT-NEXT: [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24 +; O3DEFAULT-NEXT: [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX_24]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP27:%.*]] = load <4 x i32>, <4 x i32>* [[TMP26]], align 4 +; O3DEFAULT-NEXT: [[TMP28:%.*]] = add nsw <4 x i32> [[TMP27]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP29:%.*]] = bitcast i32* [[ARRAYIDX2_24]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28 +; O3DEFAULT-NEXT: [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28 +; O3DEFAULT-NEXT: [[TMP30:%.*]] = bitcast i32* [[ARRAYIDX_28]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP31:%.*]] = load <4 x i32>, <4 x i32>* [[TMP30]], align 4 +; O3DEFAULT-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP33:%.*]] = bitcast i32* [[ARRAYIDX2_28]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP32]], <4 x i32>* [[TMP33]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32 +; O3DEFAULT-NEXT: [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32 +; O3DEFAULT-NEXT: [[TMP34:%.*]] = bitcast i32* [[ARRAYIDX_32]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP35:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4 +; O3DEFAULT-NEXT: [[TMP36:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX2_32]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36 +; O3DEFAULT-NEXT: [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36 +; O3DEFAULT-NEXT: [[TMP38:%.*]] = bitcast i32* [[ARRAYIDX_36]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP39:%.*]] = load <4 x i32>, <4 x i32>* [[TMP38]], align 4 +; O3DEFAULT-NEXT: [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP3]] +; 
O3DEFAULT-NEXT: [[TMP41:%.*]] = bitcast i32* [[ARRAYIDX2_36]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP40]], <4 x i32>* [[TMP41]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40 +; O3DEFAULT-NEXT: [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40 +; O3DEFAULT-NEXT: [[TMP42:%.*]] = bitcast i32* [[ARRAYIDX_40]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP43:%.*]] = load <4 x i32>, <4 x i32>* [[TMP42]], align 4 +; O3DEFAULT-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[TMP43]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP45:%.*]] = bitcast i32* [[ARRAYIDX2_40]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* [[TMP45]], align 4 +; O3DEFAULT-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44 +; O3DEFAULT-NEXT: [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44 +; O3DEFAULT-NEXT: [[TMP46:%.*]] = bitcast i32* [[ARRAYIDX_44]] to <4 x i32>* +; O3DEFAULT-NEXT: [[TMP47:%.*]] = load <4 x i32>, <4 x i32>* [[TMP46]], align 4 +; O3DEFAULT-NEXT: [[TMP48:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP3]] +; O3DEFAULT-NEXT: [[TMP49:%.*]] = bitcast i32* [[ARRAYIDX2_44]] to <4 x i32>* +; O3DEFAULT-NEXT: store <4 x i32> [[TMP48]], <4 x i32>* [[TMP49]], align 4 +; O3DEFAULT-NEXT: [[TMP50:%.*]] = load i32, i32* [[A]], align 4 +; O3DEFAULT-NEXT: ret i32 [[TMP50]] +; +; Os-LABEL: @disabled( +; Os-NEXT: entry: +; Os-NEXT: br label [[FOR_BODY:%.*]] +; Os: for.body: +; Os-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; Os-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; Os-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; Os-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; Os-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; Os-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; Os-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; Os-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; Os-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 +; Os: for.end: +; Os-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; Os-NEXT: ret i32 [[TMP1]] +; +; Oz-LABEL: @disabled( +; Oz-NEXT: entry: +; Oz-NEXT: br label [[FOR_BODY:%.*]] +; Oz: for.body: +; Oz-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; Oz-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; Oz-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; Oz-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; Oz-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; Oz-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; Oz-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; Oz-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; Oz-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 +; Oz: for.end: +; Oz-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; Oz-NEXT: ret i32 [[TMP1]] +; +; O1VEC-LABEL: @disabled( +; O1VEC-NEXT: entry: +; O1VEC-NEXT: br label [[FOR_BODY:%.*]] +; O1VEC: for.body: +; O1VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O1VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; O1VEC-NEXT: [[TMP0:%.*]] = load i32, i32* 
[[ARRAYIDX]], align 4 +; O1VEC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; O1VEC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; O1VEC-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O1VEC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; O1VEC-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 +; O1VEC: for.end: +; O1VEC-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O1VEC-NEXT: ret i32 [[TMP1]] +; +; OzVEC-LABEL: @disabled( +; OzVEC-NEXT: entry: +; OzVEC-NEXT: br label [[FOR_BODY:%.*]] +; OzVEC: for.body: +; OzVEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; OzVEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; OzVEC-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; OzVEC-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; OzVEC-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; OzVEC-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; OzVEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OzVEC-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; OzVEC-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 +; OzVEC: for.end: +; OzVEC-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; OzVEC-NEXT: ret i32 [[TMP1]] +; +; O1VEC2-LABEL: @disabled( +; O1VEC2-NEXT: entry: +; O1VEC2-NEXT: br label [[FOR_BODY:%.*]] +; O1VEC2: for.body: +; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; O1VEC2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; O1VEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; O1VEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O1VEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O1VEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; O1VEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !4 +; O1VEC2: for.end: +; O1VEC2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O1VEC2-NEXT: ret i32 [[TMP1]] +; +; OzVEC2-LABEL: @disabled( +; OzVEC2-NEXT: entry: +; OzVEC2-NEXT: br label [[FOR_BODY:%.*]] +; OzVEC2: for.body: +; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; OzVEC2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; OzVEC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; OzVEC2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; OzVEC2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; OzVEC2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; OzVEC2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !4 +; OzVEC2: for.end: +; OzVEC2-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; OzVEC2-NEXT: ret i32 [[TMP1]] +; +; O3DIS-LABEL: @disabled( +; O3DIS-NEXT: entry: +; O3DIS-NEXT: br label 
[[FOR_BODY:%.*]] +; O3DIS: for.body: +; O3DIS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O3DIS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] +; O3DIS-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; O3DIS-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[N:%.*]] +; O3DIS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] +; O3DIS-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4 +; O3DIS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; O3DIS-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 48 +; O3DIS-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !0 +; O3DIS: for.end: +; O3DIS-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4 +; O3DIS-NEXT: ret i32 [[TMP1]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %N + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 48 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 + +for.end: ; preds = %for.body + %1 = load i32, i32* %a, align 4 + ret i32 %1 +} + +!0 = !{!0, !1} +!1 = !{!"llvm.loop.vectorize.enable", i1 1} +!2 = !{!2, !3} +!3 = !{!"llvm.loop.vectorize.enable", i1 0} diff --git a/llvm/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll new file mode 100644 index 00000000000..bb972c4488a --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll @@ -0,0 +1,24 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -vectorizer-min-trip-count=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: <4 x float> +define void @trivial_loop(float* nocapture %a) nounwind uwtable optsize { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %add = fadd float %0, 1.000000e+00 + store float %add, float* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 8 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll new file mode 100644 index 00000000000..ad79e38cafa --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/mul_slm_16bit.ll @@ -0,0 +1,145 @@ +; REQUIRES: asserts +; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) { +entry: + %cmp12 = icmp eq i32 %N, 0 + 
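+; Guard for the zero-trip-count case: when N == 0 the loop body never runs and the function returns the initial accumulator value of 0 via the phi in for.cond.cleanup.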
br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = trunc i32 %add4 to i8 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + ret i8 %acc.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = sext i8 %1 to i32 +; sources of the mul are sext/sext from i8 +; use the pmullw/sext seq. +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul = mul nsw i32 %conv3, %conv +; sources of the mul are zext/sext from i8 +; use pmulhw/pmullw/pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %conv4 = zext i8 %1 to i32 + %mul2 = mul nsw i32 %conv4, %conv + %sum0 = add i32 %mul, %mul2 +; sources of the mul are zext/zext from i8 +; use pmullw/zext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %conv5 = zext i8 %0 to i32 + %mul3 = mul nsw i32 %conv5, %conv4 + %sum1 = add i32 %sum0, %mul3 +; sources of the mul are sext/-120 +; use pmullw/sext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul4 = mul nsw i32 -120, %conv3 + %sum2 = add i32 %sum1, %mul4 +; sources of the mul are sext/250 +; use pmulhw/pmullw/pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul5 = mul nsw i32 250, %conv3 + %sum3 = add i32 %sum2, %mul5 +; sources of the mul are zext/-120 +; use pmulhw/pmullw/pshuf +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul6 = mul nsw i32 -120, %conv4 + %sum4 = add i32 %sum3, %mul6 +; sources of the mul are zext/250 +; use pmullw/zext +; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 + %mul7 = mul nsw i32 250, %conv4 + %sum5 = add i32 %sum4, %mul7 + %add = add i32 %acc.013, 5 + %add4 = add i32 %add, %sum5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %phitmp = trunc i32 %add4 to i16 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + ret i16 %acc.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv + %0 = load i16, i16* %arrayidx, align 1 + %conv = sext i16 %0 to i32 + %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv + %1 = load i16, i16* %arrayidx2, align 1 + %conv3 = sext i16 %1 to i32 +; sources of the mul are sext/sext from i16 +; use the pmulhw/pmullw/pshuf seq.
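+; (pmullw computes the low 16 bits and pmulhw the high 16 bits of each signed 16x16 product; the shuffle step interleaves the two halves into 32-bit lanes, which is why this sequence is cheaper on SLM than the cost-11 pmulld used below)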
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul = mul nsw i32 %conv3, %conv +; sources of the mul are zext/sext from i16 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %conv4 = zext i16 %1 to i32 + %mul2 = mul nsw i32 %conv4, %conv + %sum0 = add i32 %mul, %mul2 +; sources of the mul are zext/zext from i16 +; use pmulhw/pmullw/zext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %conv5 = zext i16 %0 to i32 + %mul3 = mul nsw i32 %conv5, %conv4 + %sum1 = add i32 %sum0, %mul3 +; sources of the mul are sext/-32000 +; use pmulhw/pmullw/sext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul4 = mul nsw i32 -32000, %conv3 + %sum2 = add i32 %sum1, %mul4 +; sources of the mul are sext/64000 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %mul5 = mul nsw i32 64000, %conv3 + %sum3 = add i32 %sum2, %mul5 +; sources of the mul are zext/-32000 +; use pmulld +; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 + %mul6 = mul nsw i32 -32000, %conv4 + %sum4 = add i32 %sum3, %mul6 +; sources of the mul are zext/250 +; use pmulhw/pmullw/zext +; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 + %mul7 = mul nsw i32 250, %conv4 + %sum5 = add i32 %sum4, %mul7 + %add = add i32 %acc.013, 5 + %add4 = add i32 %add, %sum5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + + diff --git a/llvm/test/Transforms/LoopVectorize/X86/no-vector.ll b/llvm/test/Transforms/LoopVectorize/X86/no-vector.ll new file mode 100644 index 00000000000..4b464b01267 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/no-vector.ll @@ -0,0 +1,22 @@ +; RUN: opt -S -mtriple=i386-unknown-freebsd -mcpu=i486 -loop-vectorize < %s + +define i32 @PR14639(i8* nocapture %s, i32 %len) nounwind { +entry: + %cmp4 = icmp sgt i32 %len, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %r.05 = phi i32 [ %xor, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, i8* %s, i32 %i.06 + %0 = load i8, i8* %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %xor = xor i32 %conv, %r.05 + %inc = add nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %len + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %r.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ] + ret i32 %r.0.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll new file mode 100644 index 00000000000..177c120b3df --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath.ll @@ -0,0 +1,109 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s + +; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations +; CHECK: remark: no_fpmath.c:6:14: loop not vectorized +; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; Function Attrs: nounwind readonly ssp uwtable +define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 { +entry: + %cmp.7 = icmp sgt i32 %n, 0, !dbg !3 + br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8 +
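+; The fadd reduction in for.body below carries no fast-math flags, so the vectorizer cannot prove it is safe to reorder (reassociate) the floating-point additions; that is the source of the 'loop not vectorized' remarks checked above.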
+for.body.preheader: ; preds = %entry + br label %for.body, !dbg !9 + +for.cond.cleanup.loopexit: ; preds = %for.body + %add.lcssa = phi double [ %add, %for.body ] + br label %for.cond.cleanup, !dbg !10 + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ] + ret double %a.0.lcssa, !dbg !10 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !9 + %0 = load i32, i32* %arrayidx, align 4, !dbg !9, !tbaa !11 + %cmp1 = icmp eq i32 %0, 0, !dbg !15 + %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !9 + %add = fadd double %a.08, %cond, !dbg !16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !8 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !8 + %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !8 + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !8, !llvm.loop !17 +} + +; Function Attrs: nounwind readonly ssp uwtable +define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 { +entry: + %cmp.7 = icmp sgt i32 %n, 0, !dbg !19 + br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !22 + +for.cond.cleanup.loopexit: ; preds = %for.body + %add.lcssa = phi double [ %add, %for.body ] + br label %for.cond.cleanup, !dbg !23 + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ] + ret double %a.0.lcssa, !dbg !23 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !22 + %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !11 + %cmp1 = icmp eq i32 %0, 0, !dbg !24 + %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !22 + %add = fadd double %a.08, %cond, !dbg !25 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21 + %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !21 + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !21, !llvm.loop !26 +} + +attributes #0 = { nounwind } + +!llvm.dbg.cu = !{!28} +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 1, !"PIC Level", i32 2} +!2 = !{!"clang version 3.7.0"} +!3 = !DILocation(line: 5, column: 20, scope: !4) +!4 = distinct !DISubprogram(name: "cond_sum", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7) +!5 = !DIFile(filename: "no_fpmath.c", directory: "") +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 5, column: 3, scope: !4) +!9 = !DILocation(line: 6, column: 14, scope: !4) +!10 = !DILocation(line: 9, column: 3, scope: !4) +!11 = !{!12, !12, i64 0} +!12 = !{!"int", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !DILocation(line: 6, column: 19, scope: !4) +!16 = 
!DILocation(line: 6, column: 11, scope: !4) +!17 = distinct !{!17, !18} +!18 = !{!"llvm.loop.unroll.disable"} +!19 = !DILocation(line: 16, column: 20, scope: !20) +!20 = distinct !DISubprogram(name: "cond_sum_loop_hint", scope: !5, file: !5, line: 12, type: !6, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7) +!21 = !DILocation(line: 16, column: 3, scope: !20) +!22 = !DILocation(line: 17, column: 14, scope: !20) +!23 = !DILocation(line: 20, column: 3, scope: !20) +!24 = !DILocation(line: 17, column: 19, scope: !20) +!25 = !DILocation(line: 17, column: 11, scope: !20) +!26 = distinct !{!26, !27, !18} +!27 = !{!"llvm.loop.vectorize.enable", i1 true} +!28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", + file: !5, + isOptimized: true, flags: "-O2", + splitDebugFilename: "abc.debug", emissionKind: 2) diff --git a/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll new file mode 100644 index 00000000000..fbc4ab2aa86 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/no_fpmath_with_hotness.ll @@ -0,0 +1,113 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s + +; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations (hotness: 300) +; CHECK: remark: no_fpmath.c:6:14: loop not vectorized +; CHECK: remark: no_fpmath.c:17:14: vectorized loop (vectorization width: 2, interleaved count: 2) (hotness: 300) + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; Function Attrs: nounwind readonly ssp uwtable +define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 !prof !29 { +entry: + %cmp.7 = icmp sgt i32 %n, 0, !dbg !3 + br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8, !prof !30 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !9 + +for.cond.cleanup.loopexit: ; preds = %for.body + %add.lcssa = phi double [ %add, %for.body ] + br label %for.cond.cleanup, !dbg !10 + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ] + ret double %a.0.lcssa, !dbg !10 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !9 + %0 = load i32, i32* %arrayidx, align 4, !dbg !9, !tbaa !11 + %cmp1 = icmp eq i32 %0, 0, !dbg !15 + %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !9 + %add = fadd double %a.08, %cond, !dbg !16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !8 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !8 + %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !8 + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !8, !llvm.loop !17, !prof !31 +} + +; Function Attrs: nounwind 
readonly ssp uwtable +define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 !prof !29{ +entry: + %cmp.7 = icmp sgt i32 %n, 0, !dbg !19 + br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21, !prof !30 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !22 + +for.cond.cleanup.loopexit: ; preds = %for.body + %add.lcssa = phi double [ %add, %for.body ] + br label %for.cond.cleanup, !dbg !23 + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %a.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ] + ret double %a.0.lcssa, !dbg !23 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %a.08 = phi double [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %v, i64 %indvars.iv, !dbg !22 + %0 = load i32, i32* %arrayidx, align 4, !dbg !22, !tbaa !11 + %cmp1 = icmp eq i32 %0, 0, !dbg !24 + %cond = select i1 %cmp1, double 3.400000e+00, double 1.150000e+00, !dbg !22 + %add = fadd double %a.08, %cond, !dbg !25 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21 + %exitcond = icmp eq i32 %lftr.wideiv, %n, !dbg !21 + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !dbg !21, !llvm.loop !26, !prof !31 +} + +attributes #0 = { nounwind } + +!llvm.dbg.cu = !{!28} +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 1, !"PIC Level", i32 2} +!2 = !{!"clang version 3.7.0"} +!3 = !DILocation(line: 5, column: 20, scope: !4) +!4 = distinct !DISubprogram(name: "cond_sum", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7) +!5 = !DIFile(filename: "no_fpmath.c", directory: "") +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !DILocation(line: 5, column: 3, scope: !4) +!9 = !DILocation(line: 6, column: 14, scope: !4) +!10 = !DILocation(line: 9, column: 3, scope: !4) +!11 = !{!12, !12, i64 0} +!12 = !{!"int", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !DILocation(line: 6, column: 19, scope: !4) +!16 = !DILocation(line: 6, column: 11, scope: !4) +!17 = distinct !{!17, !18} +!18 = !{!"llvm.loop.unroll.disable"} +!19 = !DILocation(line: 16, column: 20, scope: !20) +!20 = distinct !DISubprogram(name: "cond_sum_loop_hint", scope: !5, file: !5, line: 12, type: !6, isLocal: false, isDefinition: true, scopeLine: 12, flags: DIFlagPrototyped, isOptimized: true, unit: !28, retainedNodes: !7) +!21 = !DILocation(line: 16, column: 3, scope: !20) +!22 = !DILocation(line: 17, column: 14, scope: !20) +!23 = !DILocation(line: 20, column: 3, scope: !20) +!24 = !DILocation(line: 17, column: 19, scope: !20) +!25 = !DILocation(line: 17, column: 11, scope: !20) +!26 = distinct !{!26, !27, !18} +!27 = !{!"llvm.loop.vectorize.enable", i1 true} +!28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang", + file: !5, + isOptimized: true, flags: "-O2", + splitDebugFilename: "abc.debug", emissionKind: 2) +!29 = !{!"function_entry_count", i64 3} +!30 = !{!"branch_weights", i32 99, i32 1} +!31 = !{!"branch_weights", i32 1, i32 99} diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll new file mode 100644 index 
00000000000..9fa65534f32 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; This test verifies that the loop vectorizer will NOT vectorize loops that +; will produce a tail loop with the optimize for size or the minimize size +; attributes. This is a target-dependent version of the test. +; RUN: opt < %s -loop-vectorize -force-vector-width=64 -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s +; RUN: opt < %s -loop-vectorize -S -mtriple=x86_64-unknown-linux -mcpu=skx | FileCheck %s --check-prefix AUTOVF + +target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128" + +@tab = common global [32 x i8] zeroinitializer, align 1 + +define i32 @foo_optsize() #0 { +; CHECK-LABEL: @foo_optsize( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, 
i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2 +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. 
= select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +attributes #0 = { optsize } + +define i32 @foo_minsize() #1 { +; CHECK-LABEL: @foo_minsize( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* [[TMP4]], i32 1, <64 x i1> [[TMP2]], <64 x i8> undef) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <64 x i8> [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <64 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>, <64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> +; CHECK-NEXT: [[TMP8:%.*]] = 
bitcast i8* [[TMP3]] to <64 x i8>* +; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP7]], <64 x i8>* [[TMP8]], i32 1, <64 x i1> [[TMP2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP10]], 0 +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 +; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !5 +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; + +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08 + %0 = load i8, i8* %arrayidx, align 1 + %cmp1 = icmp eq i8 %0, 0 + %. = select i1 %cmp1, i8 2, i8 1 + store i8 %., i8* %arrayidx, align 1 + %inc = add nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %i.08, 202 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 0 +} + +attributes #1 = { minsize } + + +; We can't vectorize this one because we version for stride==1; even having TC +; a multiple of VF. +; CHECK-LABEL: @scev4stride1 +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.body: +; AUTOVF-LABEL: @scev4stride1 +; AUTOVF-NOT: vector.scevcheck +; AUTOVF-NOT: vector.body: +; AUTOVF-LABEL: for.body: +define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 { +for.body.preheader: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = mul nsw i32 %i.07, %k + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07 + store i32 %0, i32* %arrayidx1, align 4 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, 256 + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + ret void +} + +attributes #2 = { optsize } + + +; PR39497 +; We can't vectorize this one because we version for overflow check and tiny +; trip count leads to opt-for-size (which otherwise could fold the tail by +; masking). 
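+; Roughly, in C (a sketch for reference only; the variable name is
+; hypothetical):
+;   unsigned d = 0;
+;   while ((d & 0xffff) < 4)
+;     d = (d & 0xffff) + 1;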
+; CHECK-LABEL: @main +; CHECK-NOT: vector.scevcheck +; CHECK-NOT: vector.body: +; CHECK-LABEL: for.cond: +; AUTOVF-LABEL: @main +; AUTOVF-NOT: vector.scevcheck +; AUTOVF-NOT: vector.body: +; AUTOVF-LABEL: for.cond: +define i32 @main() local_unnamed_addr { +while.cond: + br label %for.cond + +for.cond: + %d.0 = phi i32 [ 0, %while.cond ], [ %add, %for.cond ] + %conv = and i32 %d.0, 65535 + %cmp = icmp ult i32 %conv, 4 + %add = add nuw nsw i32 %conv, 1 + br i1 %cmp, label %for.cond, label %while.cond.loopexit + +while.cond.loopexit: + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll new file mode 100644 index 00000000000..3f22b56290e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll @@ -0,0 +1,114 @@ +; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx < %s | FileCheck %s --check-prefix=AVX +; RUN: opt -S -loop-vectorize -enable-vplan-native-path -mtriple x86_64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX + +; extern int arr[8][8]; +; extern int arr2[8]; +; +; void foo(int n) +; { +; int i1, i2; +; +; #pragma clang loop vectorize(enable) +; for (i1 = 0; i1 < 8; i1++) { +; arr2[i1] = i1; +; for (i2 = 0; i2 < 8; i2++) +; arr[i2][i1] = i1 + n; +; } +; } +; + +; CHECK-LABEL: vector.ph: +; CHECK: %[[SplatVal:.*]] = insertelement <4 x i32> undef, i32 %n, i32 0 +; CHECK: %[[Splat:.*]] = shufflevector <4 x i32> %[[SplatVal]], <4 x i32> undef, <4 x i32> zeroinitializer + +; CHECK-LABEL: vector.body: +; CHECK: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] +; CHECK: %[[VecInd:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] +; CHECK: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <4 x i64> %[[VecInd]] +; CHECK: %[[VecIndTr:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> +; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[VecIndTr]], <4 x i32*> %[[AAddr]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) +; CHECK: %[[VecIndTr2:.*]] = trunc <4 x i64> %[[VecInd]] to <4 x i32> +; CHECK: %[[StoreVal:.*]] = add nsw <4 x i32> %[[VecIndTr2]], %[[Splat]] +; CHECK: br label %[[InnerLoop:.+]] + +; CHECK: [[InnerLoop]]: +; CHECK: %[[InnerPhi:.*]] = phi <4 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ] +; CHECK: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <4 x i64> %[[InnerPhi]], <4 x i64> %[[VecInd]] +; CHECK: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %[[StoreVal]], <4 x i32*> %[[AAddr2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true +; CHECK: %[[InnerPhiNext]] = add nuw nsw <4 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1> +; CHECK: %[[VecCond:.*]] = icmp eq <4 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8> +; CHECK: %[[InnerCond:.*]] = extractelement <4 x i1> %[[VecCond]], i32 0 +; CHECK: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] + +; CHECK: [[ForInc]]: +; CHECK: %[[IndNext]] = add i64 %[[Ind]], 4 +; CHECK: %[[VecIndNext]] = add <4 x i64> %[[VecInd]], <i64 4, i64 4, i64 4, i64 4> +; CHECK: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 +; CHECK: br i1 %[[Cmp]], label %middle.block, label %vector.body + +; AVX-LABEL: vector.ph: +; AVX: 
%[[SplatVal:.*]] = insertelement <8 x i32> undef, i32 %n, i32 0 +; AVX: %[[Splat:.*]] = shufflevector <8 x i32> %[[SplatVal]], <8 x i32> undef, <8 x i32> zeroinitializer + +; AVX-LABEL: vector.body: +; AVX: %[[Ind:.*]] = phi i64 [ 0, %vector.ph ], [ %[[IndNext:.*]], %[[ForInc:.*]] ] +; AVX: %[[VecInd:.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %[[VecIndNext:.*]], %[[ForInc]] ] +; AVX: %[[AAddr:.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, <8 x i64> %[[VecInd]] +; AVX: %[[VecIndTr:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32> +; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[VecIndTr]], <8 x i32*> %[[AAddr]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; AVX: %[[VecIndTr2:.*]] = trunc <8 x i64> %[[VecInd]] to <8 x i32> +; AVX: %[[StoreVal:.*]] = add nsw <8 x i32> %[[VecIndTr2]], %[[Splat]] +; AVX: br label %[[InnerLoop:.+]] + +; AVX: [[InnerLoop]]: +; AVX: %[[InnerPhi:.*]] = phi <8 x i64> [ %[[InnerPhiNext:.*]], %[[InnerLoop]] ], [ zeroinitializer, %vector.body ] +; AVX: %[[AAddr2:.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, <8 x i64> %[[InnerPhi]], <8 x i64> %[[VecInd]] +; AVX: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %[[StoreVal]], <8 x i32*> %[[AAddr2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true +; AVX: %[[InnerPhiNext]] = add nuw nsw <8 x i64> %[[InnerPhi]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> +; AVX: %[[VecCond:.*]] = icmp eq <8 x i64> %[[InnerPhiNext]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8> +; AVX: %[[InnerCond:.*]] = extractelement <8 x i1> %[[VecCond]], i32 0 +; AVX: br i1 %[[InnerCond]], label %[[ForInc]], label %[[InnerLoop]] + +; AVX: [[ForInc]]: +; AVX: %[[IndNext]] = add i64 %[[Ind]], 8 +; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8> +; AVX: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8 +; AVX: br i1 %[[Cmp]], label %middle.block, label %vector.body + +@arr2 = external global [8 x i32], align 16 +@arr = external global [8 x [8 x i32]], align 16 + +; Function Attrs: norecurse nounwind uwtable +define void @foo(i32 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.inc8, %entry + %indvars.iv21 = phi i64 [ 0, %entry ], [ %indvars.iv.next22, %for.inc8 ] + %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @arr2, i64 0, i64 %indvars.iv21 + %0 = trunc i64 %indvars.iv21 to i32 + store i32 %0, i32* %arrayidx, align 4 + %1 = trunc i64 %indvars.iv21 to i32 + %add = add nsw i32 %1, %n + br label %for.body3 + +for.body3: ; preds = %for.body3, %for.body + %indvars.iv = phi i64 [ 0, %for.body ], [ %indvars.iv.next, %for.body3 ] + %arrayidx7 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* @arr, i64 0, i64 %indvars.iv, i64 %indvars.iv21 + store i32 %add, i32* %arrayidx7, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 8 + br i1 %exitcond, label %for.inc8, label %for.body3 + +for.inc8: ; preds = %for.body3 + %indvars.iv.next22 = add nuw nsw i64 %indvars.iv21, 1 + %exitcond23 = icmp eq i64 %indvars.iv.next22, 8 + br i1 %exitcond23, label %for.end10, label %for.body, !llvm.loop !1 + +for.end10: ; preds = %for.inc8 + ret void +} + +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll 
b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll new file mode 100644 index 00000000000..0e6f06497e3 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; The parallel loop has been invalidated by the new memory accesses introduced +; by reg2mem (Loop::isParallel() starts to return false). Ensure the loop is +; now non-vectorizable. + +;CHECK-NOT: <4 x i32> +define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + %indvars.iv.next.reg2mem = alloca i64 + %indvars.iv.reg2mem = alloca i64 + %"reg2mem alloca point" = bitcast i32 0 to i32 + store i64 0, i64* %indvars.iv.reg2mem + br label %for.body + +for.body: ; preds = %for.body.for.body_crit_edge, %entry + %indvars.iv.reload = load i64, i64* %indvars.iv.reg2mem + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.reload + %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !4 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.reload + %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !4 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3 + store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !4 + %indvars.iv.next = add i64 %indvars.iv.reload, 1 + ; A new store without the parallel metadata here: + store i64 %indvars.iv.next, i64* %indvars.iv.next.reg2mem + %indvars.iv.next.reload1 = load i64, i64* %indvars.iv.next.reg2mem + %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next.reload1 + %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !4 + store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !4 + %indvars.iv.next.reload = load i64, i64* %indvars.iv.next.reg2mem + %lftr.wideiv = trunc i64 %indvars.iv.next.reload to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop !3 + +for.body.for.body_crit_edge: ; preds = %for.body + %indvars.iv.next.reload2 = load i64, i64* %indvars.iv.next.reg2mem + store i64 %indvars.iv.next.reload2, i64* %indvars.iv.reg2mem + br label %for.body + +for.end: ; preds = %for.body + ret void +} + +!3 = !{!3, !{!"llvm.loop.parallel_accesses", !4}} +!4 = distinct !{} diff --git a/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll new file mode 100644 index 00000000000..be778850096 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/parallel-loops.ll @@ -0,0 +1,115 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; A tricky loop: +; +; void loop(int *a, int *b) { +; for (int i = 0; i < 512; ++i) { +; a[a[i]] = b[i]; +; a[i] = b[i+1]; +; } +;} + +;CHECK-LABEL: @loop( +;CHECK-NOT: <4 x i32> +define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = 
%for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3 + store i32 %0, i32* %arrayidx4, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next + %2 = load i32, i32* %arrayidx6, align 4 + store i32 %2, i32* %arrayidx2, align 4 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; The same loop with parallel loop metadata added to the loop branch +; and the memory instructions. + +;CHECK-LABEL: @parallel_loop( +;CHECK: <4 x i32> +define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !13 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !13 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3 + ; This store might have originated from inlining a function with a parallel + ; loop. Refers to a list with the "original loop reference" (!4) also included. + store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !15 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next + %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !13 + store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !13 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3 + +for.end: ; preds = %for.body + ret void +} + +; The same loop with an illegal parallel loop metadata: the memory +; accesses refer to a different loop's identifier. + +;CHECK-LABEL: @mixed_metadata( +;CHECK-NOT: <4 x i32> + +define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !llvm.access.group !16 + %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4, !llvm.access.group !16 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %idxprom3 + ; This refers to the loop marked with !7 which we are not in at the moment. + ; It should prevent detecting as a parallel loop. 
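+; (For reference, per the metadata defined at the end of this file: loop !6
+; lists access group !16 in its llvm.loop.parallel_accesses, while !17 belongs
+; to loop !7, so an access tagged !17 is not parallel with respect to the loop
+; that branches with !llvm.loop !6.)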
+ store i32 %0, i32* %arrayidx4, align 4, !llvm.access.group !17 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx6 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next + %2 = load i32, i32* %arrayidx6, align 4, !llvm.access.group !16 + store i32 %2, i32* %arrayidx2, align 4, !llvm.access.group !16 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !6 + +for.end: ; preds = %for.body + ret void +} + +!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13, !15}} +!4 = !{!4, !{!"llvm.loop.parallel_accesses", !14, !15}} +!6 = !{!6, !{!"llvm.loop.parallel_accesses", !16}} +!7 = !{!7, !{!"llvm.loop.parallel_accesses", !17}} +!13 = distinct !{} +!14 = distinct !{} +!15 = distinct !{} +!16 = distinct !{} +!17 = distinct !{} diff --git a/llvm/test/Transforms/LoopVectorize/X86/powof2div.ll b/llvm/test/Transforms/LoopVectorize/X86/powof2div.ll new file mode 100644 index 00000000000..3e4bef6d4d0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/powof2div.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon = type { [100 x i32], i32, [100 x i32] } + +@Foo = common global %struct.anon zeroinitializer, align 4 + +; CHECK-LABEL: @foo( +; CHECK: load <4 x i32>, <4 x i32>* +; CHECK: sdiv <4 x i32> +; CHECK: store <4 x i32> + +define void @foo(){ +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %div = sdiv i32 %0, 2 + %arrayidx2 = getelementptr inbounds %struct.anon, %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv + store i32 %div, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll new file mode 100644 index 00000000000..6b61ddbc413 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -loop-vectorize -dce -instcombine < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" +target triple = "x86_64-unknown-linux-gnu" + +; Ensure that the 'inbounds' is preserved on the GEPs that feed the load and store in the loop. 
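+; In C-like terms the function below does roughly the following (a sketch
+; only; names are hypothetical and the addrspace(1) qualifiers are omitted):
+;   void foo(char *dst, char *src, unsigned long n) {
+;     char **d = (char **)(dst + 16);
+;     char **s = (char **)(src + 16);
+;     unsigned long i = 0;
+;     do { d[i] = s[i]; i++; } while (i < n);
+;   }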
+define void @foo(i8 addrspace(1)* align 8 dereferenceable_or_null(16), i8 addrspace(1)* align 8 dereferenceable_or_null(8), i64) #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[PREHEADER:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[DOT10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP0:%.*]], i64 16 +; CHECK-NEXT: [[DOT11:%.*]] = bitcast i8 addrspace(1)* [[DOT10]] to i8 addrspace(1)* addrspace(1)* +; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16 +; CHECK-NEXT: [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)* +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2:%.*]], 1 +; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP2]], 1 +; CHECK-NEXT: [[UMAX1:%.*]] = select i1 [[TMP4]], i64 [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[UMAX1]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 8 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 [[TMP6]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP1]], i64 [[TMP6]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[DOT10]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[DOT12]], [[SCEVGEP]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[UMAX]], -16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP7]] to <4 x i8 addrspace(1)*> addrspace(1)* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP8]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 4 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP9]] to <4 x i8 addrspace(1)*> addrspace(1)* +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP10]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 8 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP11]] to <4 x i8 addrspace(1)*> addrspace(1)* +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP12]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 12 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP13]] to <4 x i8 addrspace(1)*> addrspace(1)* +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP14]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* 
[[DOT11]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP15]] to <4 x i8 addrspace(1)*> addrspace(1)* +; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP16]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP17]] to <4 x i8 addrspace(1)*> addrspace(1)* +; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD6]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP18]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP19]] to <4 x i8 addrspace(1)*> addrspace(1)* +; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD7]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP20]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 12 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP21]] to <4 x i8 addrspace(1)*> addrspace(1)* +; CHECK-NEXT: store <4 x i8 addrspace(1)*> [[WIDE_LOAD8]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP22]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DOT18:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDVARS_IV3]] +; CHECK-NEXT: [[V:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT18]], align 8 +; CHECK-NEXT: [[DOT20:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDVARS_IV3]] +; CHECK-NEXT: store i8 addrspace(1)* [[V]], i8 addrspace(1)* addrspace(1)* [[DOT20]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1 +; CHECK-NEXT: [[DOT21:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT4]], [[TMP2]] +; CHECK-NEXT: br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop !7 +; CHECK: loopexit: +; CHECK-NEXT: ret void +; +entry: + br label %preheader + +preheader: ; preds = %4 + %.10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 16 + %.11 = bitcast i8 addrspace(1)* %.10 to i8 addrspace(1)* addrspace(1)* + %.12 = getelementptr inbounds i8, i8 addrspace(1)* %1, i64 16 + %.13 = bitcast i8 addrspace(1)* %.12 to i8 addrspace(1)* addrspace(1)* + br label %loop + +loop: + %indvars.iv3 = phi i64 [ 0, %preheader ], [ %indvars.iv.next4, %loop ] + %.18 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.13, i64 %indvars.iv3 + %v = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.18, align 8 + %.20 = getelementptr inbounds i8 
addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.11, i64 %indvars.iv3 + store i8 addrspace(1)* %v, i8 addrspace(1)* addrspace(1)* %.20, align 8 + %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1 + %.21 = icmp ult i64 %indvars.iv.next4, %2 + br i1 %.21, label %loop, label %loopexit + +loopexit: + ret void +} + +attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" } + +!0 = !{i32 0, i32 2147483646} +!1 = !{} diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll new file mode 100644 index 00000000000..efbfecc2f54 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -0,0 +1,36 @@ +; PR34438 +; Loop has a short trip count of 8 iterations. It should be vectorized, since neither runtime checks nor a tail loop are necessary. +; Two cases are tested: AVX (MaxVF=8 = TripCount) and AVX512 (MaxVF=16 > TripCount). + +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=skylake-avx512 -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define void @small_tc(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +; CHECK-LABEL: @small_tc +; CHECK: load <8 x float>, <8 x float>* +; CHECK: fadd fast <8 x float> +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4, !llvm.access.group !5 + %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !5 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.access.group !5 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 8 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4 + +for.end: + ret void +} + +!3 = !{!3, !{!"llvm.loop.parallel_accesses", !5}} +!4 = !{!4} +!5 = distinct !{} diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll new file mode 100644 index 00000000000..1f2a2061586 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -0,0 +1,213 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s + +; The test checks that there is no assert caused by the issue described in PR35432. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = common local_unnamed_addr global [192 x [192 x i32]] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define i32 @main()
local_unnamed_addr #0 { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[S:%.*]] = alloca i16, align 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[I]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull [[TMP0]]) +; CHECK-NEXT: store i32 0, i32* [[I]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[S]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull [[TMP1]]) +; CHECK-NEXT: [[CALL:%.*]] = call i32 (i32*, ...) bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull [[I]]) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 +; CHECK-NEXT: [[STOREMERGE6:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: store i16 [[STOREMERGE6]], i16* [[S]], align 2 +; CHECK-NEXT: [[CONV17:%.*]] = and i32 [[TMP2]], 65472 +; CHECK-NEXT: [[CMP8:%.*]] = icmp eq i32 [[CONV17]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END12:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[TMP3:%.*]] = sub i32 -1, [[TMP2]] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[STOREMERGE_IN9:%.*]] = phi i32 [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_INC9:%.*]] ] +; CHECK-NEXT: [[CONV52:%.*]] = and i32 [[STOREMERGE_IN9]], 255 +; CHECK-NEXT: [[CMP63:%.*]] = icmp ult i32 [[TMP2]], [[CONV52]] +; CHECK-NEXT: br i1 [[CMP63]], label [[FOR_BODY8_LR_PH:%.*]], label [[FOR_INC9]] +; CHECK: for.body8.lr.ph: +; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[STOREMERGE_IN9]] to i8 +; CHECK-NEXT: [[DOTPROMOTED:%.*]] = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 +; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[CONV3]], -1 +; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT: [[TMP6:%.*]] = sub i32 -1, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP7]], i32 [[TMP3]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[UMAX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP9]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP10:%.*]] = add i8 [[CONV3]], -1 +; CHECK-NEXT: [[TMP11:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = sub i32 -1, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP3]], [[TMP12]] +; CHECK-NEXT: [[UMAX1:%.*]] = select i1 [[TMP13]], i32 [[TMP3]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[UMAX1]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8 +; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP16]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = add i8 [[TMP10]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP18:%.*]] = sub i8 [[TMP10]], [[MUL_RESULT]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt i8 [[TMP18]], [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp ult i8 [[TMP17]], [[TMP10]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 true, i1 [[TMP19]], i1 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i32 [[TMP15]], 255 +; CHECK-NEXT: [[TMP23:%.*]] = or i1 [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = or i1 [[TMP23]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP25:%.*]] = or i1 false, 
[[TMP24]] +; CHECK-NEXT: br i1 [[TMP25]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP9]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP9]], [[N_MOD_VF]] +; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: [[IND_END:%.*]] = sub i8 [[CONV3]], [[CAST_CRD]] +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[DOTPROMOTED]], i32 0 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP26]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i8 [[CONV3]], [[TMP27]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], <i8 0, i8 -1, i8 -2, i8 -3> +; CHECK-NEXT: [[INDUCTION3:%.*]] = add <4 x i8> [[BROADCAST_SPLAT]], <i8 -4, i8 -5, i8 -6, i8 -7> +; CHECK-NEXT: [[TMP28:%.*]] = add i8 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = add i8 [[OFFSET_IDX]], -4 +; CHECK-NEXT: [[TMP30]] = add <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP31]] = add <4 x i32> [[VEC_PHI2]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP32:%.*]] = add i8 [[TMP28]], -1 +; CHECK-NEXT: [[TMP33:%.*]] = add i8 [[TMP29]], -1 +; CHECK-NEXT: [[TMP34:%.*]] = zext i8 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP35:%.*]] = zext i8 [[TMP33]] to i32 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP31]], [[TMP30]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX4]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> +; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP9]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[DOTPROMOTED]], [[FOR_BODY8_LR_PH]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY8:%.*]] +; CHECK: for.body8: +; CHECK-NEXT: [[INC5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY8]] ] +; CHECK-NEXT: [[C_04:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[DEC:%.*]], [[FOR_BODY8]] ] +; CHECK-NEXT: [[INC]] = add i32 [[INC5]], 1 +; CHECK-NEXT: [[DEC]] = add 
i8 [[C_04]], -1 +; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[DEC]] to i32 +; CHECK-NEXT: [[CMP6:%.*]] = icmp ult i32 [[TMP2]], [[CONV5]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop !2 +; CHECK: for.cond4.for.inc9_crit_edge: +; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_BODY8]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: store i32 [[INC_LCSSA]], i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 +; CHECK-NEXT: br label [[FOR_INC9]] +; CHECK: for.inc9: +; CHECK-NEXT: [[CONV10:%.*]] = and i32 [[STOREMERGE_IN9]], 65535 +; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[CONV10]], 1 +; CHECK-NEXT: [[CONV1:%.*]] = and i32 [[ADD]], 65472 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CONV1]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END12_CRIT_EDGE:%.*]] +; CHECK: for.cond.for.end12_crit_edge: +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INC9]] ] +; CHECK-NEXT: [[STOREMERGE:%.*]] = trunc i32 [[ADD_LCSSA]] to i16 +; CHECK-NEXT: store i16 [[STOREMERGE]], i16* [[S]], align 2 +; CHECK-NEXT: br label [[FOR_END12]] +; CHECK: for.end12: +; CHECK-NEXT: [[CALL13:%.*]] = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull [[S]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull [[TMP1]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP0]]) +; CHECK-NEXT: ret i32 0 +; +entry: + %i = alloca i32, align 4 + %s = alloca i16, align 2 + %0 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #3 + store i32 0, i32* %i, align 4 + %1 = bitcast i16* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull %1) #3 + %call = call i32 (i32*, ...) 
bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i) #3 + %2 = load i32, i32* %i, align 4 + %storemerge6 = trunc i32 %2 to i16 + store i16 %storemerge6, i16* %s, align 2 + %conv17 = and i32 %2, 65472 + %cmp8 = icmp eq i32 %conv17, 0 + br i1 %cmp8, label %for.body.lr.ph, label %for.end12 + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.inc9 + %storemerge.in9 = phi i32 [ %2, %for.body.lr.ph ], [ %add, %for.inc9 ] + %conv52 = and i32 %storemerge.in9, 255 + %cmp63 = icmp ult i32 %2, %conv52 + br i1 %cmp63, label %for.body8.lr.ph, label %for.inc9 + +for.body8.lr.ph: ; preds = %for.body + %conv3 = trunc i32 %storemerge.in9 to i8 + %.promoted = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 + br label %for.body8 + +for.body8: ; preds = %for.body8.lr.ph, %for.body8 + %inc5 = phi i32 [ %.promoted, %for.body8.lr.ph ], [ %inc, %for.body8 ] + %c.04 = phi i8 [ %conv3, %for.body8.lr.ph ], [ %dec, %for.body8 ] + %inc = add i32 %inc5, 1 + %dec = add i8 %c.04, -1 + %conv5 = zext i8 %dec to i32 + %cmp6 = icmp ult i32 %2, %conv5 + br i1 %cmp6, label %for.body8, label %for.cond4.for.inc9_crit_edge + +for.cond4.for.inc9_crit_edge: ; preds = %for.body8 + %inc.lcssa = phi i32 [ %inc, %for.body8 ] + store i32 %inc.lcssa, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 + br label %for.inc9 + +for.inc9: ; preds = %for.cond4.for.inc9_crit_edge, %for.body + %conv10 = and i32 %storemerge.in9, 65535 + %add = add nuw nsw i32 %conv10, 1 + %conv1 = and i32 %add, 65472 + %cmp = icmp eq i32 %conv1, 0 + br i1 %cmp, label %for.body, label %for.cond.for.end12_crit_edge + +for.cond.for.end12_crit_edge: ; preds = %for.inc9 + %add.lcssa = phi i32 [ %add, %for.inc9 ] + %storemerge = trunc i32 %add.lcssa to i16 + store i16 %storemerge, i16* %s, align 2 + br label %for.end12 + +for.end12: ; preds = %for.cond.for.end12_crit_edge, %entry + %call13 = call i32 (i16*, ...) bitcast (i32 (...)* @foo to i32 (i16*, ...)*)(i16* nonnull %s) #3 + call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull %1) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #3 + ret i32 0 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare i32 @goo(...) local_unnamed_addr #2 + +declare i32 @foo(...) 
local_unnamed_addr #2 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll new file mode 100644 index 00000000000..51d70d773a6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr36524.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" + +define void @foo() { +; CHECK-LABEL: @foo( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 2, i64 3, i64 4, i64 5>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = add i64 2, [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[OFFSET_IDX1]] to i32 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP11]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 0 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80 +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; +entry: + br label %loop + +loop: + %0 = phi i64 [ 2, %entry ], [ %3, %loop ] + %1 = and i64 %0, 4294967295 + %2 = trunc i64 %0 to i32 + %3 = add nuw nsw i64 %1, 1 + %4 = icmp sgt i32 %2, 80 + br i1 %4, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll new file mode 100644 index 00000000000..38ca5f964a0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr39160.ll @@ -0,0 +1,98 @@ +; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1" +target triple = "x86_64-unknown-linux-gnu" + +; Make sure that we can compile the test without crash. 
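+; (The 'middle.block' label matched below is the block the loop vectorizer
+; emits between the vector loop and the scalar remainder loop, so matching it
+; also confirms that a loop actually got vectorized.)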
+define void @barney() { + +; CHECK-LABEL: @barney( +; CHECK: middle.block: + +bb: + br label %bb2 + +bb2: ; preds = %bb2, %bb + %tmp4 = icmp slt i32 undef, 0 + br i1 %tmp4, label %bb2, label %bb5 + +bb5: ; preds = %bb2 + br label %bb19 + +bb18: ; preds = %bb33 + ret void + +bb19: ; preds = %bb36, %bb5 + %tmp21 = phi i64 [ undef, %bb36 ], [ 2, %bb5 ] + %tmp22 = phi i32 [ %tmp65, %bb36 ], [ undef, %bb5 ] + br label %bb50 + +bb33: ; preds = %bb62 + br i1 undef, label %bb18, label %bb36 + +bb36: ; preds = %bb33 + br label %bb19 + +bb46: ; preds = %bb50 + br i1 undef, label %bb48, label %bb59 + +bb48: ; preds = %bb46 + %tmp49 = add i32 %tmp52, 14 + ret void + +bb50: ; preds = %bb50, %bb19 + %tmp52 = phi i32 [ %tmp55, %bb50 ], [ %tmp22, %bb19 ] + %tmp53 = phi i64 [ %tmp56, %bb50 ], [ 1, %bb19 ] + %tmp54 = add i32 %tmp52, 12 + %tmp55 = add i32 %tmp52, 13 + %tmp56 = add nuw nsw i64 %tmp53, 1 + %tmp58 = icmp ult i64 %tmp53, undef + br i1 %tmp58, label %bb50, label %bb46 + +bb59: ; preds = %bb46 + br label %bb62 + +bb62: ; preds = %bb68, %bb59 + %tmp63 = phi i32 [ %tmp65, %bb68 ], [ %tmp55, %bb59 ] + %tmp64 = phi i64 [ %tmp66, %bb68 ], [ %tmp56, %bb59 ] + %tmp65 = add i32 %tmp63, 13 + %tmp66 = add nuw nsw i64 %tmp64, 1 + %tmp67 = icmp ult i64 %tmp66, %tmp21 + br i1 %tmp67, label %bb68, label %bb33 + +bb68: ; preds = %bb62 + br label %bb62 +} + +define i32 @foo(i32 addrspace(1)* %p) { + +; CHECK-LABEL: foo +; CHECK: middle.block: + +entry: + br label %outer + +outer: ; preds = %outer_latch, %entry + %iv = phi i64 [ 2, %entry ], [ %iv.next, %outer_latch ] + br label %inner + +inner: ; preds = %inner, %outer + %0 = phi i32 [ %2, %inner ], [ 0, %outer ] + %a = phi i32 [ %3, %inner ], [ 1, %outer ] + %b = phi i32 [ %1, %inner ], [ 6, %outer ] + %1 = add i32 %b, 2 + %2 = or i32 %0, %b + %3 = add nuw nsw i32 %a, 1 + %4 = zext i32 %3 to i64 + %5 = icmp ugt i64 %iv, %4 + br i1 %5, label %inner, label %outer_latch + +outer_latch: ; preds = %inner + store atomic i32 %2, i32 addrspace(1)* %p unordered, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %6 = icmp ugt i64 %iv, 63 + br i1 %6, label %exit, label %outer + +exit: ; preds = %outer_latch + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/propagate-metadata.ll b/llvm/test/Transforms/LoopVectorize/X86/propagate-metadata.ll new file mode 100644 index 00000000000..2825ddbac9d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/propagate-metadata.ll @@ -0,0 +1,25 @@ +; RUN: opt -S -mtriple="x86_64-unknown-linux-gnu" -loop-vectorize < %s | FileCheck %s + +; Don't crash on unknown metadata +; CHECK-LABEL: @no_propagate_range_metadata( +; CHECK: load <16 x i8> +; CHECK: store <16 x i8> +define void @no_propagate_range_metadata(i8* readonly %first.coerce, i8* readnone %last.coerce, i8* nocapture %result) { +for.body.preheader: + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader + %result.addr.05 = phi i8* [ %incdec.ptr, %for.body ], [ %result, %for.body.preheader ] + %first.sroa.0.04 = phi i8* [ %incdec.ptr.i.i.i, %for.body ], [ %first.coerce, %for.body.preheader ] + %0 = load i8, i8* %first.sroa.0.04, align 1, !range !0 + store i8 %0, i8* %result.addr.05, align 1 + %incdec.ptr.i.i.i = getelementptr inbounds i8, i8* %first.sroa.0.04, i64 1 + %incdec.ptr = getelementptr inbounds i8, i8* %result.addr.05, i64 1 + %lnot.i = icmp eq i8* %incdec.ptr.i.i.i, %last.coerce + br i1 %lnot.i, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + ret void +} + +!0 = !{i8 0, i8 2} diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll new file mode 100644 index 00000000000..13ceaef4dbb --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/ptr-indvar-crash.ll @@ -0,0 +1,20 @@ +; RUN: opt -loop-vectorize -S %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @f(i128 %p1) { +entry: + br label %while.body + +while.body: + %p.05 = phi i8* [ %add.ptr, %while.body ], [ null, %entry ] + %p1.addr.04 = phi i128 [ %sub, %while.body ], [ %p1, %entry ] + %add.ptr = getelementptr inbounds i8, i8* %p.05, i32 2 + %sub = add nsw i128 %p1.addr.04, -2 + %tobool = icmp eq i128 %sub, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll new file mode 100644 index 00000000000..4284fbacfa7 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/rauw-bug.ll @@ -0,0 +1,33 @@ +; RUN: opt -slp-vectorizer -S %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "x86_64-apple-macosx" + +; This test used to fail under libgmalloc, because we would try to access a +; pointer that was already deleted. +; +; llvm-lit -v --param use_gmalloc=1 --param +; gmalloc_path=/usr/lib/libgmalloc.dylib +; test/Transforms/LoopVectorize/X86/rauw-bug.ll +; +; radar://15498655 + +; CHECK: reduced +define void @reduced() { +entry: + br i1 undef, label %while.body, label %while.cond63.preheader.while.end76_crit_edge + +while.cond63.preheader.while.end76_crit_edge: + ret void + +while.body: + %d2_fx.015 = phi double [ %sub52, %while.body ], [ undef, %entry ] + %d2_fy.014 = phi double [ %sub58, %while.body ], [ undef, %entry ] + %d3_fy.013 = phi double [ %div56, %while.body ], [ undef, %entry ] + %d3_fx.012 = phi double [ %div50, %while.body ], [ undef, %entry ] + %div50 = fmul double %d3_fx.012, 1.250000e-01 + %sub52 = fsub double 0.000000e+00, %div50 + %div56 = fmul double %d3_fy.013, 1.250000e-01 + %sub58 = fsub double 0.000000e+00, %div56 + br label %while.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll new file mode 100644 index 00000000000..6393002d507 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll @@ -0,0 +1,35 @@ +; RUN: opt -S -loop-vectorize -mcpu=prescott -disable-basicaa < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" +target triple = "i386-apple-darwin" + +; PR15344 +define void @test1(float* nocapture %arg, i32 %arg1) nounwind { +; CHECK-LABEL: @test1( +; CHECK: preheader +; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0 +; CHECK: vector.memcheck + +bb: + br label %bb2 + +bb2: ; preds = %bb + %tmp = load double, double* null, align 8 + br i1 undef, label %bb3, label %bb12 + +bb3: ; preds = %bb3, %bb2 + %tmp4 = phi double [ %tmp9, %bb3 ], [ %tmp, %bb2 ] + %tmp5 = phi i32 [ %tmp8, %bb3 ], [ 0, %bb2 ] + %tmp6 = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 %tmp5 + %tmp7 = load double, double* %tmp6, align 4 + %tmp8 = add nsw i32 %tmp5, 1 + %tmp9 = fadd fast double %tmp4, undef + %tmp10 = getelementptr inbounds float,
float* %arg, i32 %tmp5 + store float undef, float* %tmp10, align 4 + %tmp11 = icmp eq i32 %tmp8, %arg1 + br i1 %tmp11, label %bb12, label %bb3 + +bb12: ; preds = %bb3, %bb2 + %tmp13 = phi double [ %tmp, %bb2 ], [ %tmp9, %bb3 ] + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll new file mode 100644 index 00000000000..1146e31ec25 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -0,0 +1,112 @@ +; RUN: opt -S -loop-vectorize < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define float @reduction_sum_float_ieee(i32 %n, float* %array) { +; CHECK-LABEL: define float @reduction_sum_float_ieee( +entry: + %entry.cond = icmp ne i32 0, 4096 + br i1 %entry.cond, label %loop, label %loop.exit + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ] + %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ] + %address = getelementptr float, float* %array, i32 %idx + %value = load float, float* %address + %sum.inc = fadd float %sum, %value + %idx.inc = add i32 %idx, 1 + %be.cond = icmp ne i32 %idx.inc, 4096 + br i1 %be.cond, label %loop, label %loop.exit + +loop.exit: + %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ] +; CHECK-NOT: %wide.load = load <4 x float>, <4 x float>* +; CHECK: ret float %sum.lcssa + ret float %sum.lcssa +} + +define float @reduction_sum_float_fastmath(i32 %n, float* %array) { +; CHECK-LABEL: define float @reduction_sum_float_fastmath( +; CHECK: fadd fast <4 x float> +; CHECK: fadd fast <4 x float> +; CHECK: fadd fast <4 x float> +; CHECK: fadd fast <4 x float> +; CHECK: fadd fast <4 x float> +entry: + %entry.cond = icmp ne i32 0, 4096 + br i1 %entry.cond, label %loop, label %loop.exit + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ] + %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ] + %address = getelementptr float, float* %array, i32 %idx + %value = load float, float* %address + %sum.inc = fadd fast float %sum, %value + %idx.inc = add i32 %idx, 1 + %be.cond = icmp ne i32 %idx.inc, 4096 + br i1 %be.cond, label %loop, label %loop.exit + +loop.exit: + %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ] +; CHECK: ret float %sum.lcssa + ret float %sum.lcssa +} + +define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) { +; CHECK-LABEL: define float @reduction_sum_float_only_reassoc( +; CHECK-NOT: fadd fast +; CHECK: fadd reassoc <4 x float> +; CHECK: fadd reassoc <4 x float> +; CHECK: fadd reassoc <4 x float> +; CHECK: fadd reassoc <4 x float> +; CHECK: fadd reassoc <4 x float> + +entry: + %entry.cond = icmp ne i32 0, 4096 + br i1 %entry.cond, label %loop, label %loop.exit + +loop: + %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ] + %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ] + %address = getelementptr float, float* %array, i32 %idx + %value = load float, float* %address + %sum.inc = fadd reassoc float %sum, %value + %idx.inc = add i32 %idx, 1 + %be.cond = icmp ne i32 %idx.inc, 4096 + br i1 %be.cond, label %loop, label %loop.exit + +loop.exit: + %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ] +; CHECK: ret float %sum.lcssa + ret float %sum.lcssa +} + +define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %array) { +; CHECK-LABEL: define float @reduction_sum_float_only_reassoc_and_contract( +; CHECK-NOT: fadd fast +; CHECK: 
fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd reassoc contract float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
new file mode 100644
index 00000000000..7c29faa51e6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-small-size.ll
@@ -0,0 +1,80 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -mcpu=core-avx2 -force-vector-interleave=1 -dce -instcombine -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Make sure we ignore the costs of the redundant reduction casts.
+; char reduction_i8(char *a, char *b, int n) {
+;   char sum = 0;
+;   for (int i = 0; i < n; ++i)
+;     sum += (a[i] + b[i]);
+;   return sum;
+; }
+;
+
+; CHECK-LABEL: reduction_i8
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = load
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = load
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = and i32 %{{.*}}, 255
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = add
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = trunc
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: %{{.*}} = icmp
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 1 For instruction: br
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = phi
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = getelementptr
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = load
+; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = zext i8 %{{.*}} to i32
+; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = getelementptr +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = load +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = zext i8 %{{.*}} to i32 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = and i32 %{{.*}}, 255 +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = add +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = trunc +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{.*}} = icmp +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: br +; +define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { +entry: + %cmp.12 = icmp sgt i32 %n, 0 + br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: + %add5.lcssa = phi i32 [ %add5, %for.body ] + %conv6 = trunc i32 %add5.lcssa to i8 + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i8 [ %conv6, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i8 %sum.0.lcssa + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %sum.013 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %conv4 = and i32 %sum.013, 255 + %add = add nuw nsw i32 %conv, %conv4 + %add5 = add nuw nsw i32 %add, %conv3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll new file mode 100644 index 00000000000..bd69d27d2d5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/redundant-vf2-cost.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -loop-vectorize -mtriple x86_64 -debug -disable-output 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Check that cost model is not executed twice for VF=2 when vectorization is +; forced for a particular loop. + +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{[0-9]+}} = load i32 +; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: store i32 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: %{{[0-9]+}} = load i32 +; CHECK-NOT: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: store i32 +; CHECK: LV: Vector loop of width 2 costs: {{[0-9]+}}. 
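+;
+; As a rough illustration (hypothetical C source, not from the original
+; commit), a loop such as the following, with vectorization forced by a
+; pragma, produces the llvm.loop.vectorize.enable metadata used below:
+;
+;   unsigned foo(int *A, unsigned n) {
+;     unsigned i = 0;
+;   #pragma clang loop vectorize(enable)
+;     for (; i != n; ++i)
+;       A[i] += 1;
+;     return i;   /* %__init.addr.0.lcssa.i below corresponds to this i */
+;   }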
+
+define i32 @foo(i32* %A, i32 %n) {
+entry:
+  %cmp3.i = icmp eq i32 %n, 0
+  br i1 %cmp3.i, label %exit, label %for.body.i
+
+for.body.i:
+  %iv = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+  %ld_addr = getelementptr inbounds i32, i32* %A, i32 %iv
+  %0 = load i32, i32* %ld_addr, align 4
+  %val = add i32 %0, 1
+  store i32 %val, i32* %ld_addr, align 4
+  %add.i = add nsw i32 %iv, 1
+  %cmp.i = icmp eq i32 %add.i, %n
+  br i1 %cmp.i, label %exit, label %for.body.i, !llvm.loop !0
+
+exit:
+  %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+  ret i32 %__init.addr.0.lcssa.i
+}
+
+!0 = !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
new file mode 100644
index 00000000000..8205092deff
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll
@@ -0,0 +1,134 @@
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Test that the register usage estimation is not affected by the presence of
+; debug intrinsics.
+;
+; In the test below, the intervals of the values %0 and %r.08 end at the add
+; instruction preceding the call to the intrinsic, and are recorded against
+; the index of the call instruction. This means the debug intrinsic must be
+; considered when erasing instructions from the list of open-intervals.
+;
+; Tests generated from the following source (with and without -g):
+
+; unsigned test(unsigned *a, unsigned n) {
+;   unsigned i, r = 0;
+;   for(i = 0; i < n; i++)
+;     r += a[i];
+;   return r;
+; }
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: LV: Checking a loop in "test_g"
+; CHECK: LV(REG): Found max usage: 2
+
+define i32 @test_g(i32* nocapture readonly %a, i32 %n) local_unnamed_addr !dbg !6 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32* %a, i64 0, metadata !12, metadata !16), !dbg !17
+  tail call void @llvm.dbg.value(metadata i32 %n, i64 0, metadata !13, metadata !16), !dbg !18
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14, metadata !16), !dbg !20
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !15, metadata !16), !dbg !19
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !14, metadata !16), !dbg !20
+  %cmp6 = icmp eq i32 %n, 0, !dbg !21
+  br i1 %cmp6, label %for.end, label %for.body.preheader, !dbg !25
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64, !dbg !21
+  br label %for.body, !dbg !27
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %r.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv, !dbg !27
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !27, !tbaa !28
+  %add = add i32 %0, %r.08, !dbg !32
+  tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !15, metadata !16), !dbg !19
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !33
+  tail call void @llvm.dbg.value(metadata i32 %add, i64 0, metadata !15, metadata !16), !dbg !19
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count, !dbg !21
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !25, !llvm.loop !35
+
+for.end.loopexit: ; preds = %for.body + br label %for.end, !dbg !38 + +for.end: ; preds = %for.end.loopexit, %entry + %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ] + ret i32 %r.0.lcssa, !dbg !38 +} + +; CHECK: LV: Checking a loop in "test" +; CHECK: LV(REG): Found max usage: 2 + +define i32 @test(i32* nocapture readonly %a, i32 %n) local_unnamed_addr { +entry: + %cmp6 = icmp eq i32 %n, 0 + br i1 %cmp6, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %r.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4, !tbaa !28 + %add = add i32 %0, %r.08 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ] + ret i32 %r.0.lcssa +} + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "test.c", directory: "") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!6 = distinct !DISubprogram(name: "test_g", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11) +!7 = !DISubroutineType(types: !8) +!8 = !{!9, !10, !9} +!9 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64) +!11 = !{!12, !13, !14, !15} +!12 = !DILocalVariable(name: "a", arg: 1, scope: !6, file: !1, line: 1, type: !10) +!13 = !DILocalVariable(name: "n", arg: 2, scope: !6, file: !1, line: 1, type: !9) +!14 = !DILocalVariable(name: "i", scope: !6, file: !1, line: 2, type: !9) +!15 = !DILocalVariable(name: "r", scope: !6, file: !1, line: 2, type: !9) +!16 = !DIExpression() +!17 = !DILocation(line: 1, column: 27, scope: !6) +!18 = !DILocation(line: 1, column: 39, scope: !6) +!19 = !DILocation(line: 2, column: 15, scope: !6) +!20 = !DILocation(line: 2, column: 12, scope: !6) +!21 = !DILocation(line: 3, column: 16, scope: !22) +!22 = !DILexicalBlockFile(scope: !23, file: !1, discriminator: 1) +!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 3, column: 3) +!24 = distinct !DILexicalBlock(scope: !6, file: !1, line: 3, column: 3) +!25 = !DILocation(line: 3, column: 3, scope: !26) +!26 = !DILexicalBlockFile(scope: !24, file: !1, discriminator: 1) +!27 = !DILocation(line: 4, column: 10, scope: !23) +!28 = !{!29, !29, i64 0} +!29 = !{!"int", !30, i64 0} +!30 = !{!"omnipotent char", !31, i64 0} +!31 = !{!"Simple C/C++ TBAA"} +!32 = !DILocation(line: 4, column: 7, scope: !23) +!33 = !DILocation(line: 3, column: 22, scope: !34) +!34 = !DILexicalBlockFile(scope: !23, file: !1, discriminator: 2) +!35 = distinct !{!35, !36, !37} +!36 = !DILocation(line: 3, column: 3, scope: !24) +!37 = !DILocation(line: 4, column: 13, 
scope: !24)
+!38 = !DILocation(line: 5, column: 3, scope: !6)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
new file mode 100644
index 00000000000..9b276aa2bd7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -0,0 +1,135 @@
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F
+; REQUIRES: asserts
+
+@a = global [1024 x i8] zeroinitializer, align 16
+@b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @foo() {
+; This function has a loop with a SAD pattern. Here we check that when VF = 16
+; the register usage doesn't exceed 16.
+;
+; CHECK-LABEL: foo
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 13
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %2 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %2, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i32 @goo() {
+; Since indvars.iv is used in a computation chain that only feeds getelementptr
+; or cmp, it will not have a vector version, and the vector register usage will
+; not exceed the number of available vector registers.
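+;
+; A rough C equivalent of the loop below (hypothetical reconstruction, for
+; illustration only), where a and b are the unsigned char arrays above; the
+; offset indices i + 3 and i + 2 feed only the address computations:
+;
+;   int goo() {
+;     int s = 0;
+;     for (int i = 0; i < 1024; ++i)
+;       s += abs(a[i + 3] - b[i + 2]);
+;     return s;
+;   }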
+; CHECK-LABEL: goo +; CHECK: LV(REG): VF = 8 +; CHECK-NEXT: LV(REG): Found max usage: 7 +; CHECK: LV(REG): VF = 16 +; CHECK-NEXT: LV(REG): Found max usage: 13 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %tmp1 = add nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1 + %tmp = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %tmp to i32 + %tmp2 = add nsw i64 %indvars.iv, 2 + %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2 + %tmp3 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %tmp3 to i32 + %sub = sub nsw i32 %conv, %conv3 + %ispos = icmp sgt i32 %sub, -1 + %neg = sub nsw i32 0, %sub + %tmp4 = select i1 %ispos, i32 %sub, i32 %neg + %add = add nsw i32 %tmp4, %s.015 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define i64 @bar(i64* nocapture %a) { +; CHECK-LABEL: bar +; CHECK: LV(REG): VF = 2 +; CHECK: LV(REG): Found max usage: 3 +; +entry: + br label %for.body + +for.cond.cleanup: + %add2.lcssa = phi i64 [ %add2, %for.body ] + ret i64 %add2.lcssa + +for.body: + %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ] + %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012 + %0 = load i64, i64* %arrayidx, align 8 + %add = add nsw i64 %0, %i.012 + store i64 %add, i64* %arrayidx, align 8 + %add2 = add nsw i64 %add, %s.011 + %inc = add nuw nsw i64 %i.012, 1 + %exitcond = icmp eq i64 %inc, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +@d = external global [0 x i64], align 8 +@e = external global [0 x i32], align 4 +@c = external global [0 x i32], align 4 + +define void @hoo(i32 %n) { +; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive but its index %tmp can +; be gathered into a vector. For VF == 16, the vector version of %tmp will be <16 x i64> +; so the max usage of AVX512 vector register will be 2. 
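+;
+; In rough C terms (hypothetical sketch, element types simplified), the loop
+; below performs an indexed gather through d:
+;
+;   void hoo(int n) {
+;     for (long i = 0; i < 10000; i++)
+;       c[i] = e[d[i]];
+;   }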
+; AVX512F-LABEL: hoo
+; AVX512F: LV(REG): VF = 16
+; AVX512F: LV(REG): Found max usage: 2
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv
+  %tmp = load i64, i64* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp
+  %tmp1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx3 = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv
+  store i32 %tmp1, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll b/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll
new file mode 100644
index 00000000000..1add87db611
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/register-assumption.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -loop-vectorize -instcombine -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1() {
+entry:
+  %alloca = alloca float, align 4
+  br label %loop_exit.dim.11.critedge
+
+loop_exit.dim.11.critedge:                        ; preds = %loop_body.dim.0
+  %ptrint = ptrtoint float* %alloca to i64
+  %maskedptr = and i64 %ptrint, 4
+  %maskcond = icmp eq i64 %maskedptr, 0
+  br label %loop_header.dim.017.preheader
+
+loop_header.dim.017.preheader:                    ; preds = %loop_exit.dim.016, %loop_exit.dim.11.critedge
+  br label %loop_body.dim.018
+
+loop_body.dim.018:                                ; preds = %loop_body.dim.018, %loop_header.dim.017.preheader
+  %invar_address.dim.019.0135 = phi i64 [ 0, %loop_header.dim.017.preheader ], [ %0, %loop_body.dim.018 ]
+  call void @llvm.assume(i1 %maskcond)
+; CHECK: call void @llvm.assume(
+; CHECK-NOT: call void @llvm.assume(
+  %0 = add nuw nsw i64 %invar_address.dim.019.0135, 1
+  %1 = icmp eq i64 %0, 256
+  br i1 %1, label %loop_header.dim.017.preheader, label %loop_body.dim.018
+}
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
new file mode 100755
index 00000000000..aff372b562f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+; This test checks a vector GEP feeding a scatter.
+; The code below crashed due to destroyed SSA during incorrect vectorization
+; of the GEP.
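+;
+; In rough C terms (hypothetical reconstruction, for illustration only), each
+; scalar iteration stores 8 into the first two columns of a row of d with a
+; strided row index, which vectorization turns into vector GEPs feeding
+; masked scatters:
+;
+;   for (long i = 8; i < n; i += 2) {
+;     d[i][0] = 8;
+;     d[i][1] = 8;
+;   }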
+ +@d = global [10 x [10 x i32]] zeroinitializer, align 16 +@c = external global i32, align 4 +@a = external global i32, align 4 +@b = external global i64, align 8 + +; Function Attrs: norecurse nounwind ssp uwtable +define void @_Z3fn1v() #0 { +; CHECK-LABEL: @_Z3fn1v( +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ] +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> +; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body +; +entry: + %0 = load i32, i32* @c, align 4 + %cmp34 = icmp sgt i32 %0, 8 + br i1 %cmp34, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %1 = load i32, i32* @a, align 4 + %tobool = icmp eq i32 %1, 0 + %2 = load i64, i64* @b, align 8 + %mul = mul i64 %2, 4063299859190 + %tobool6 = icmp eq i64 %mul, 0 + %3 = sext i32 %0 to i64 + br i1 %tobool, label %for.body.us.preheader, label %for.body.preheader + +for.body.preheader: ; preds = %for.body.lr.ph + br label %for.body + +for.body.us.preheader: ; preds = %for.body.lr.ph + br label %for.body.us + 
+for.body.us: ; preds = %for.body.us.preheader, %for.cond.cleanup4.us-lcssa.us.us + %indvars.iv78 = phi i64 [ %indvars.iv.next79, %for.cond.cleanup4.us-lcssa.us.us ], [ 8, %for.body.us.preheader ] + %indvars.iv70 = phi i64 [ %indvars.iv.next71, %for.cond.cleanup4.us-lcssa.us.us ], [ 0, %for.body.us.preheader ] + %4 = sub nsw i64 8, %indvars.iv78 + %add.ptr.us = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 %indvars.iv78 + %5 = add nsw i64 %4, %indvars.iv70 + %arraydecay.us.us.us = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %5, i64 0 + br i1 %tobool6, label %for.body5.us.us.us.preheader, label %for.body5.us.us48.preheader + +for.body5.us.us48.preheader: ; preds = %for.body.us + store i32 8, i32* %arraydecay.us.us.us, align 16 + %indvars.iv.next66 = or i64 %indvars.iv70, 1 + %6 = add nsw i64 %4, %indvars.iv.next66 + %arraydecay.us.us55.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %6, i64 0 + store i32 8, i32* %arraydecay.us.us55.1, align 8 + br label %for.cond.cleanup4.us-lcssa.us.us + +for.body5.us.us.us.preheader: ; preds = %for.body.us + store i32 7, i32* %arraydecay.us.us.us, align 16 + %indvars.iv.next73 = or i64 %indvars.iv70, 1 + %7 = add nsw i64 %4, %indvars.iv.next73 + %arraydecay.us.us.us.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr.us, i64 %7, i64 0 + store i32 7, i32* %arraydecay.us.us.us.1, align 8 + br label %for.cond.cleanup4.us-lcssa.us.us + +for.cond.cleanup4.us-lcssa.us.us: ; preds = %for.body5.us.us48.preheader, %for.body5.us.us.us.preheader + %indvars.iv.next79 = add nuw nsw i64 %indvars.iv78, 2 + %cmp.us = icmp slt i64 %indvars.iv.next79, %3 + %indvars.iv.next71 = add nuw nsw i64 %indvars.iv70, 2 + br i1 %cmp.us, label %for.body.us, label %for.cond.cleanup.loopexit + +for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup4.us-lcssa.us.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit99: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit99, %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv95 = phi i64 [ %indvars.iv.next96, %for.body ], [ 8, %for.body.preheader ] + %indvars.iv87 = phi i64 [ %indvars.iv.next88, %for.body ], [ 0, %for.body.preheader ] + %8 = sub nsw i64 8, %indvars.iv95 + %add.ptr = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 %indvars.iv95 + %9 = add nsw i64 %8, %indvars.iv87 + %arraydecay.us31 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr, i64 %9, i64 0 + store i32 8, i32* %arraydecay.us31, align 16 + %indvars.iv.next90 = or i64 %indvars.iv87, 1 + %10 = add nsw i64 %8, %indvars.iv.next90 + %arraydecay.us31.1 = getelementptr inbounds [10 x i32], [10 x i32]* %add.ptr, i64 %10, i64 0 + store i32 8, i32* %arraydecay.us31.1, align 8 + %indvars.iv.next96 = add nuw nsw i64 %indvars.iv95, 2 + %cmp = icmp slt i64 %indvars.iv.next96, %3 + %indvars.iv.next88 = add nuw nsw i64 %indvars.iv87, 2 + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99 +} + +attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" 
"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll b/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll new file mode 100644 index 00000000000..cd3e89ae735 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/slm-no-vectorize.ll @@ -0,0 +1,49 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s +; REQUIRES: asserts +; This test should not be vectorized in X86\SLM arch +; Vectorizing the 64bit multiply in this case is wrong since +; it can be done with a lower bit mode (notice that the sources is 16bit) +; Also addq\subq (quad word) has a high cost on SLM arch. +; this test has a bad performance (regression of -70%) if vectorized on SLM arch +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) { +entry: +; MSG: LV: Selecting VF: 1. + %cmp17 = icmp sgt i32 %LastIndex, 0 + br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv5 = sext i16 %Scale to i64 + %sh_prom = and i64 %conv5, 4294967295 + %0 = sext i16 %lag to i64 + %wide.trip.count = zext i32 %LastIndex to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %conv8 = trunc i64 %add7 to i32 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ] + ret i32 %Accumulator.0.lcssa + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv + %1 = load i16, i16* %arrayidx, align 2 + %conv = sext i16 %1 to i64 + %2 = add nsw i64 %indvars.iv, %0 + %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2 + %3 = load i16, i16* %arrayidx3, align 2 + %conv4 = sext i16 %3 to i64 + %mul = mul nsw i64 %conv4, %conv + %shr = ashr i64 %mul, %sh_prom + %add7 = add i64 %shr, %Accumulator.018 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll new file mode 100644 index 00000000000..e162a3a6b2b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -0,0 +1,408 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -loop-vectorize-with-block-frequency -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] 
zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 +@G = common global [32 x [1024 x i32]] zeroinitializer, align 16 +@ub = common global [1024 x i32] zeroinitializer, align 16 +@uc = common global [1024 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@fa = common global [1024 x float] zeroinitializer, align 16 +@fb = common global [1024 x float] zeroinitializer, align 16 +@ic = common global [1024 x i32] zeroinitializer, align 16 +@da = common global [1024 x float] zeroinitializer, align 16 +@db = common global [1024 x float] zeroinitializer, align 16 +@dc = common global [1024 x float] zeroinitializer, align 16 +@dd = common global [1024 x float] zeroinitializer, align 16 +@dj = common global [1024 x i32] zeroinitializer, align 16 + +; We can optimize this test without a tail. +define void @example1() optsize { +; CHECK-LABEL: @example1( +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[TMP9:%.*]] +; CHECK: br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2 +; CHECK: ret void +; + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +; Can vectorize in 'optsize' mode by masking the needed tail. 
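+; In rough C terms (hypothetical sketch), the first sub-loop of example2 is:
+;
+;   for (int i = 0; i < n; i++)
+;     b[i] = x;
+;
+; With optsize no scalar epilogue is emitted, so the vectorizer rounds the
+; trip count up to a multiple of VF and predicates each lane's store on
+; "lane index <= n - 1", as the checks below show.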
+define void @example2(i32 %n, i32 %x) optsize { +; CHECK-LABEL: @example2( +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]] +; CHECK: .lr.ph5.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3> +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP10]], align 16 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 +; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK: pred.store.if3: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP5]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP12]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.continue4: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK: pred.store.if5: +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP6]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP14]], align 8 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK: pred.store.continue6: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 +; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.if7: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP7]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP16]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.continue8: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK: middle.block: +; 
CHECK-NEXT: br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: ._crit_edge: +; CHECK-NEXT: ret void +; + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph5, label %.preheader + +..preheader_crit_edge: ; preds = %.lr.ph5 + %phitmp = sext i32 %n to i64 + br label %.preheader + +.preheader: ; preds = %..preheader_crit_edge, %0 + %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ] + %2 = icmp eq i32 %n, 0 + br i1 %2, label %._crit_edge, label %.lr.ph + +.lr.ph5: ; preds = %0, %.lr.ph5 + %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ] + %3 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv6 + store i32 %x, i32* %3, align 4 + %indvars.iv.next7 = add i64 %indvars.iv6, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ] + %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ] + %4 = add nsw i32 %.02, -1 + %5 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv + %6 = load i32, i32* %5, align 4 + %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv + %8 = load i32, i32* %7, align 4 + %9 = and i32 %8, %6 + %10 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %9, i32* %10, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %11 = icmp eq i32 %4, 0 + br i1 %11, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %.preheader + ret void +} + +; N is unknown, we need a tail. Can't vectorize because loop has no primary +; induction. +;CHECK-LABEL: @example3( +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize { + %1 = icmp eq i32 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ] + %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ] + %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ] + %2 = add nsw i32 %.05, -1 + %3 = getelementptr inbounds i32, i32* %.023, i64 1 + %4 = load i32, i32* %.023, align 16 + %5 = getelementptr inbounds i32, i32* %.014, i64 1 + store i32 %4, i32* %.014, align 16 + %6 = icmp eq i32 %2, 0 + br i1 %6, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + +; We can't vectorize this one because we need a runtime ptr check. +;CHECK-LABEL: @example23( +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize { + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16, i16* %.04, i64 1 + %3 = load i16, i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32, i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + + +; We CAN vectorize this example because the pointers are marked as noalias. 
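+; A rough C equivalent (hypothetical sketch): with restrict-qualified
+; pointers no runtime overlap check is needed, unlike in example23 above:
+;
+;   void example23b(unsigned short *restrict src, int *restrict dst) {
+;     for (int i = 0; i < 256; i++)
+;       *dst++ = (int)*src++ << 7;
+;   }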
+define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize { +; CHECK-LABEL: @example23b( +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[NEXT_GEP]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], <i32 7, i32 7, i32 7, i32 7> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[TMP6:%.*]] +; CHECK: br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7 +; CHECK: ret void +; + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16, i16* %.04, i64 1 + %3 = load i16, i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32, i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +; We CAN vectorize this example by folding the tail it entails. 
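+; The trip count here is 257, which is not a multiple of VF = 4, so the
+; vectorizer rounds it up to 260 and predicates every load and store on
+; "lane index < 257". Roughly, in C (hypothetical sketch):
+;
+;   for (long i = 0; i < 257; i++)   /* 257 % 4 != 0, so the tail is folded */
+;     *dst++ = (int)*src++ << 7;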
+define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize { +; CHECK-LABEL: @example23c( +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3> +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], <i64 257, i64 257, i64 257, i64 257> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK: pred.load.if: +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] +; CHECK: pred.load.continue: +; CHECK-NEXT: [[TMP4:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK: pred.load.if11: +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; CHECK: pred.load.continue12: +; CHECK-NEXT: [[TMP8:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; CHECK: pred.load.if13: +; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; CHECK: pred.load.continue14: +; CHECK-NEXT: [[TMP12:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK: pred.load.if15: +; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2 +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; CHECK: pred.load.continue16: +; CHECK-NEXT: [[TMP16:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP15]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP4]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7 +; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4 
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; CHECK: pred.store.if17: +; CHECK-NEXT: [[TMP21:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7 +; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]] +; CHECK-NEXT: store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] +; CHECK: pred.store.continue18: +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; CHECK: pred.store.if19: +; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7 +; CHECK-NEXT: [[TMP27:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]] +; CHECK-NEXT: store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]] +; CHECK: pred.store.continue20: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]] +; CHECK: pred.store.if21: +; CHECK-NEXT: [[TMP29:%.*]] = zext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7 +; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]] +; CHECK-NEXT: store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]] +; CHECK: pred.store.continue22: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[TMP33:%.*]] +; CHECK: br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9 +; CHECK: ret void +; + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16, i16* %.04, i64 1 + %3 = load i16, i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32, i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i64 %i.02, 1 + %exitcond = icmp eq i64 %7, 257 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +; We CAN'T vectorize this example because it would entail a tail and an +; induction is used outside the loop. 
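+; Roughly, in C (hypothetical sketch): the induction value escapes the loop,
+; so the tail cannot simply be folded away under optsize:
+;
+;   long i = 0;
+;   for (; i < 257; i++)
+;     *dst++ = (int)*src++ << 7;
+;   return i;   /* induction used outside the loop */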
+define i64 @example23d(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize {
+; CHECK-LABEL: @example23d(
+; CHECK-NOT: <4 x
+; CHECK: ret i64
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+  %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+  %i.02 = phi i64 [ 0, %0 ], [ %7, %1 ]
+  %2 = getelementptr inbounds i16, i16* %.04, i64 1
+  %3 = load i16, i16* %.04, align 2
+  %4 = zext i16 %3 to i32
+  %5 = shl nuw nsw i32 %4, 7
+  %6 = getelementptr inbounds i32, i32* %.013, i64 1
+  store i32 %5, i32* %.013, align 4
+  %7 = add nsw i64 %i.02, 1
+  %exitcond = icmp eq i64 %7, 257
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret i64 %7
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
new file mode 100644
index 00000000000..645f3360543
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll
@@ -0,0 +1,54 @@
+; This test checks that the given loop is still beneficial to vectorize
+; even if it contains a scalarized load (gather on AVX2).
+; RUN: opt < %s -loop-vectorize -S -o - | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind readonly uwtable
+define i32 @matrix_row_col([100 x i32]* nocapture readonly %data, i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+  %idxprom = sext i32 %i to i64
+  %idxprom5 = sext i32 %j to i64
+  br label %for.body
+
+  for.cond.cleanup:                               ; preds = %for.body
+    ret i32 %add7
+
+  for.body:                                       ; preds = %for.body, %entry
+  ; the loop gets vectorized
+  ; the first, consecutive load becomes a wide vector load
+  ; CHECK: %wide.load = load <8 x i32>
+  ; the second, strided load is scalarized
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+  ; CHECK: load i32
+
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.015 = phi i32 [ 0, %entry ], [ %add7, %for.body ]
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %idxprom, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx2, align 4, !tbaa !1
+  %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %data, i64 %indvars.iv, i64 %idxprom5
+  %1 = load i32, i32* %arrayidx6, align 4, !tbaa !1
+  %mul = mul nsw i32 %1, %0
+  %add = add i32 %sum.015, 4
+  %add7 = add i32 %add, %mul
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-cpu"="core-avx2" "target-features"="+avx,+avx2,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (cfe/trunk 284570)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/struct-store.ll b/llvm/test/Transforms/LoopVectorize/X86/struct-store.ll
new file mode 100644
index 00000000000..4ff3b0e4c05
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/struct-store.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S
+
+; Make sure we are not crashing on this one.
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@glbl = external global [16 x { i64, i64 }], align 16 + +declare void @fn() + +define void @test() { +entry: + br label %loop + +loop: + %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ] + %tmp = getelementptr inbounds [16 x { i64, i64 }], [16 x { i64, i64 }]* @glbl, i64 0, i64 %indvars.iv + store { i64, i64 } { i64 ptrtoint (void ()* @fn to i64), i64 0 }, { i64, i64 }* %tmp, align 16 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 16 + br i1 %exitcond, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll new file mode 100644 index 00000000000..5a4bfe5e6bd --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls-finite.ll @@ -0,0 +1,187 @@ +; RUN: opt -vector-library=SVML -loop-vectorize -S < %s | FileCheck %s + +; Test to verify that when math headers are built with +; __FINITE_MATH_ONLY__ enabled, causing use of __<func>_finite +; function versions, vectorization can map these to vector versions. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare float @__expf_finite(float) #0 + +; CHECK-LABEL: @exp_f32 +; CHECK: <4 x float> @__svml_expf4 +; CHECK: ret +define void @exp_f32(float* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__expf_finite(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv + store float %call, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 + +for.end: ; preds = %for.body + ret void +} + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 4} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__exp_finite(double) #0 + +; CHECK-LABEL: @exp_f64 +; CHECK: <4 x double> @__svml_exp4 +; CHECK: ret +define void @exp_f64(double* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__exp_finite(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv + store double %call, double* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !11 + +for.end: ; preds = %for.body + ret void +} + +!11 = distinct !{!11, !12, !13} +!12 = !{!"llvm.loop.vectorize.width", i32 4} +!13 = !{!"llvm.loop.vectorize.enable", i1 true} + + + + +declare float @__logf_finite(float) #0 + +; CHECK-LABEL: @log_f32 +; CHECK: <4 x float> @__svml_logf4 +; CHECK: ret +define void @log_f32(float* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + 
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call fast float @__logf_finite(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %indvars.iv + store float %call, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !21 + +for.end: ; preds = %for.body + ret void +} + +!21 = distinct !{!21, !22, !23} +!22 = !{!"llvm.loop.vectorize.width", i32 4} +!23 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__log_finite(double) #0 + +; CHECK-LABEL: @log_f64 +; CHECK: <4 x double> @__svml_log4 +; CHECK: ret +define void @log_f64(double* nocapture %varray) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call fast double @__log_finite(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %indvars.iv + store double %call, double* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31 + +for.end: ; preds = %for.body + ret void +} + +!31 = distinct !{!31, !32, !33} +!32 = !{!"llvm.loop.vectorize.width", i32 4} +!33 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare float @__powf_finite(float, float) #0 + +; CHECK-LABEL: @pow_f32 +; CHECK: <4 x float> @__svml_powf4 +; CHECK: ret +define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %indvars.iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call fast float @__powf_finite(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %indvars.iv + store float %tmp2, float* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41 + +for.end: ; preds = %for.body + ret void +} + +!41 = distinct !{!41, !42, !43} +!42 = !{!"llvm.loop.vectorize.width", i32 4} +!43 = !{!"llvm.loop.vectorize.enable", i1 true} + + +declare double @__pow_finite(double, double) #0 + +; CHECK-LABEL: @pow_f64 +; CHECK: <4 x double> @__svml_pow4 +; CHECK: ret +define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %tmp = trunc i64 %indvars.iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, double* %exp, i64 %indvars.iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call fast double @__pow_finite(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %indvars.iv + store double %tmp2, double* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, 
label %for.end, label %for.body, !llvm.loop !51 + +for.end: ; preds = %for.body + ret void +} + +!51 = distinct !{!51, !52, !53} +!52 = !{!"llvm.loop.vectorize.width", i32 4} +!53 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll new file mode 100644 index 00000000000..8ff62f178d0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll @@ -0,0 +1,501 @@ +; RUN: opt -vector-library=SVML -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare double @sin(double) #0 +declare float @sinf(float) #0 +declare double @llvm.sin.f64(double) #0 +declare float @llvm.sin.f32(float) #0 + +declare double @cos(double) #0 +declare float @cosf(float) #0 +declare double @llvm.cos.f64(double) #0 +declare float @llvm.cos.f32(float) #0 + +declare double @pow(double, double) #0 +declare float @powf(float, float) #0 +declare double @llvm.pow.f64(double, double) #0 +declare float @llvm.pow.f32(float, float) #0 + +declare double @exp(double) #0 +declare float @expf(float) #0 +declare double @llvm.exp.f64(double) #0 +declare float @llvm.exp.f32(float) #0 + +declare double @log(double) #0 +declare float @logf(float) #0 +declare double @llvm.log.f64(double) #0 +declare float @llvm.log.f32(float) #0 + + +define void @sin_f64(double* nocapture %varray) { +; CHECK-LABEL: @sin_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @sin(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32(float* nocapture %varray) { +; CHECK-LABEL: @sin_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @sinf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @sin_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.sin.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @sin_f32_intrinsic(float* nocapture 
%varray) { +; CHECK-LABEL: @sin_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.sin.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64(double* nocapture %varray) { +; CHECK-LABEL: @cos_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @cos(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32(float* nocapture %varray) { +; CHECK-LABEL: @cos_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @cosf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @cos_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.cos.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @cos_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @cos_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.cos.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64( +; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> 
[[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call double @pow(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv + store double %tmp2, double* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f64_intrinsic( +; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %arrayidx = getelementptr inbounds double, double* %exp, i64 %iv + %tmp1 = load double, double* %arrayidx, align 4 + %tmp2 = tail call double @llvm.pow.f64(double %conv, double %tmp1) + %arrayidx2 = getelementptr inbounds double, double* %varray, i64 %iv + store double %tmp2, double* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32( +; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call float @powf(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv + store float %tmp2, float* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) { +; CHECK-LABEL: @pow_f32_intrinsic( +; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %arrayidx = getelementptr inbounds float, float* %exp, i64 %iv + %tmp1 = load float, float* %arrayidx, align 4 + %tmp2 = tail call float @llvm.pow.f32(float %conv, float %tmp1) + %arrayidx2 = getelementptr inbounds float, float* %varray, i64 %iv + store float %tmp2, float* %arrayidx2, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64(double* nocapture %varray) { +; CHECK-LABEL: @exp_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body 
] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @exp(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32(float* nocapture %varray) { +; CHECK-LABEL: @exp_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @expf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @exp_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.exp.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @exp_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @exp_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.exp.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64(double* nocapture %varray) { +; CHECK-LABEL: @log_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @log(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32(float* nocapture %varray) { +; CHECK-LABEL: @log_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @logf(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + 
store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f64_intrinsic(double* nocapture %varray) { +; CHECK-LABEL: @log_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.log.f64(double %conv) + %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv + store double %call, double* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @log_f32_intrinsic(float* nocapture %varray) { +; CHECK-LABEL: @log_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.log.f32(float %conv) + %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv + store float %call, float* %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind readnone } + diff --git a/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll new file mode 100644 index 00000000000..c0bbb92c2c5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/tripcount.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -mcpu=prescott < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" +target triple = "i386-unknown-freebsd11.0" + +@big = external global [0 x i32] + +; PR18049 +; We need to truncate the exit count to i32. This is legal because the +; arithmetic is signed (%inc is nsw). 
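+;
+; A rough C-level sketch (hypothetical source, for illustration only): a
+; signed 32-bit induction variable is compared against a 64-bit trip count,
+; so the count can be truncated to i32 for the vector loop:
+;
+;   extern int big[];
+;   void tripcount(long count) {
+;     for (int i = 0; i < count; i++) /* i is sign-extended for the compare */
+;       big[i] = ~big[i];
+;   }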
+ +; CHECK-LABEL: tripcount +; CHECK: trunc i64 %count to i32 + +define void @tripcount(i64 %count) { +entry: + %cmp6 = icmp sgt i64 %count, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @big, i32 0, i32 %i.07 + %0 = load i32, i32* %arrayidx, align 4 + %neg = xor i32 %0, -1 + store i32 %neg, i32* %arrayidx, align 4 + %inc = add nsw i32 %i.07, 1 + %conv = sext i32 %inc to i64 + %cmp = icmp slt i64 %conv, %count + br i1 %cmp, label %for.body, label %for.end.loopexit + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll new file mode 100644 index 00000000000..e08ef002d0e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/uint64_to_fp64-cost-model.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + + +; CHECK: cost of 4 for VF 1 For instruction: %conv = uitofp i64 %tmp to double +; CHECK: cost of 5 for VF 2 For instruction: %conv = uitofp i64 %tmp to double +; CHECK: cost of 6 for VF 4 For instruction: %conv = uitofp i64 %tmp to double +define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind { +entry: + br label %for.body +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i64, i64* %a, i64 %indvars.iv + %tmp = load i64, i64* %arrayidx, align 4 + %conv = uitofp i64 %tmp to double + %arrayidx2 = getelementptr inbounds double, double* %b, i64 %indvars.iv + store double %conv, double* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll new file mode 100644 index 00000000000..2be565e7110 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll @@ -0,0 +1,99 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: test +; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599 + +define void @test(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %tmp0 = load float, float* %arrayidx, align 4 + %add = fadd float %tmp0, 
1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ store float %add, float* %arrayidx5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv, 1599
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+; CHECK-LABEL: foo
+; CHECK-DAG: LV: Found uniform instruction: %cond = icmp eq i64 %i.next, %n
+; CHECK-DAG: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0
+; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+
+define void @foo(i32* %a, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+ %tmp0 = trunc i64 %i to i32
+ %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0
+ store i32 %tmp0, i32* %tmp1, align 4
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, %n
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; CHECK-LABEL: goo
+; Check that %indvars.iv and %indvars.iv.next are uniform instructions even if they are used outside of the loop.
+; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599
+
+define i64 @goo(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+ %tmp0 = load float, float* %arrayidx, align 4
+ %add = fadd float %tmp0, 1.000000e+00
+ %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+ store float %add, float* %arrayidx5, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv, 1599
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %retval = add i64 %indvars.iv, %indvars.iv.next
+ ret i64 %retval
+}
+
+; CHECK-LABEL: PR38786
+; Check that first-order recurrence phis (%phi32 and %phi64) are not uniform.
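+; Roughly (hypothetical C source, for illustration only): %phi64 carries the
+; index computed in the *previous* iteration, which is what makes it a
+; first-order recurrence rather than a uniform induction:
+;
+;   void PR38786(double *y, double *x, long n) {
+;     unsigned i32 = 0;
+;     unsigned long i64 = 0; /* value carried from the previous iteration */
+;     unsigned long i64next;
+;     do {
+;       i32 = i32 + 1;
+;       i64next = i32;       /* zext i32 -> i64 */
+;       y[i64] = x[i64next]; /* y is indexed with the old i64 */
+;       i64 = i64next;
+;     } while ((long)i64next < n);
+;   }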
+; CHECK-NOT: LV: Found uniform instruction: %phi +define void @PR38786(double* %y, double* %x, i64 %n) { +entry: + br label %for.body + +for.body: + %phi32 = phi i32 [ 0, %entry ], [ %i32next, %for.body ] + %phi64 = phi i64 [ 0, %entry ], [ %i64next, %for.body ] + %i32next = add i32 %phi32, 1 + %i64next = zext i32 %i32next to i64 + %xip = getelementptr inbounds double, double* %x, i64 %i64next + %yip = getelementptr inbounds double, double* %y, i64 %phi64 + %xi = load double, double* %xip, align 8 + store double %xi, double* %yip, align 8 + %cmp = icmp slt i64 %i64next, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll new file mode 100644 index 00000000000..e71292265c2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_load.ll @@ -0,0 +1,47 @@ +; RUN: opt -basicaa -loop-vectorize -S -mcpu=core-avx2 < %s | FileCheck %s + +;float inc = 0.5; +;void foo(float *A, unsigned N) { +; +; for (unsigned i=0; i<N; i++){ +; A[i] += inc; +; } +;} + +; CHECK-LABEL: foo +; CHECK: vector.body +; CHECK: load <8 x float> +; CHECK: fadd <8 x float> +; CHECK: store <8 x float> + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@inc = global float 5.000000e-01, align 4 + +define void @foo(float* nocapture %A, i32 %N) #0 { +entry: + %cmp3 = icmp eq i32 %N, 0 + br i1 %cmp3, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %0 = load float, float* @inc, align 4 + %arrayidx = getelementptr inbounds float, float* %A, i64 %indvars.iv + %1 = load float, float* %arrayidx, align 4 + %add = fadd float %0, %1 + store float %add, float* %arrayidx, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll new file mode 100644 index 00000000000..dc23ea8efdf --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/uniformshift.ll @@ -0,0 +1,23 @@ +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: "foo" +; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %shift = ashr i32 %val, %k +define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 { +entry: + br label %body + +body: + %i = phi i64 [ 0, %entry ], [ %next, %body ] + %ptr = getelementptr inbounds i32, i32* %p, i64 %i + %val = load i32, i32* %ptr, align 4 + %shift = ashr i32 %val, %k + store i32 %shift, i32* %ptr, align 4 + %next = add nuw nsw i64 %i, 1 + %cmp = icmp eq i64 %next, 16 + br i1 %cmp, label %exit, label %body + +exit: + ret void + +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll new file mode 100644 index 00000000000..52914b6a7c6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-pm.ll @@ -0,0 +1,31 @@ +; RUN: opt < %s -O2 
-mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -S | FileCheck %s
+; RUN: opt < %s -O2 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -disable-loop-unrolling -S | FileCheck %s -check-prefix=CHECK-NOUNRL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+;CHECK-LABEL: @bar(
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
+;CHECK-NOUNRL-LABEL: @bar(
+;CHECK-NOUNRL: store <4 x i32>
+;CHECK-NOUNRL-NOT: store <4 x i32>
+;CHECK-NOUNRL: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, 6
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
new file mode 100644
index 00000000000..69d2a319a8c
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
@@ -0,0 +1,102 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -dce -S \
+; RUN: | FileCheck %s --check-prefix=CHECK-VECTOR
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-interleave=0 -dce -S \
+; RUN: | FileCheck %s --check-prefix=CHECK-SCALAR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; We don't unroll this loop because it has a small constant trip count.
+;
+; CHECK-VECTOR-LABEL: @foo(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo(
+; CHECK-SCALAR: load i32, i32*
+; CHECK-SCALAR-NOT: load i32, i32*
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @foo(i32* nocapture %A) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, 6
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 100
+ br i1 %exitcond, label %5, label %1
+
+; <label>:5 ; preds = %1
+ ret i32 undef
+}
+
+; But this is a good small loop to unroll as we don't know of a bound on its
+; trip count.
+;
+; CHECK-VECTOR-LABEL: @bar(
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; For x86, loop unrolling in the loop vectorizer is disabled when VF==1.
+;
+; CHECK-SCALAR-LABEL: @bar(
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
+define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %3 = load i32, i32* %2, align 4
+ %4 = add nsw i32 %3, 6
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+; Also unroll if we need a runtime check but it was going to be added for
+; vectorization anyway.
+; CHECK-VECTOR-LABEL: @runtime_chk(
+; CHECK-VECTOR: store <4 x float>
+; CHECK-VECTOR: store <4 x float>
+;
+; But not if the unrolling would introduce the runtime check.
+; CHECK-SCALAR-LABEL: @runtime_chk(
+; CHECK-SCALAR: store float
+; CHECK-SCALAR-NOT: store float
+define void @runtime_chk(float* %A, float* %B, float %N) {
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %mul = fmul float %0, %N
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
+ store float %mul, float* %arrayidx2, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/unroll_selection.ll b/llvm/test/Transforms/LoopVectorize/X86/unroll_selection.ll
new file mode 100644
index 00000000000..71b829071e2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/unroll_selection.ll
@@ -0,0 +1,71 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -dce -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Don't unroll when we have register pressure.
+;CHECK: reg_pressure +;CHECK: load <4 x double> +;CHECK-NOT: load <4 x double> +;CHECK: store <4 x double> +;CHECK-NOT: store <4 x double> +;CHECK: ret +define void @reg_pressure(double* nocapture %A, i32 %n) nounwind uwtable ssp { + %1 = sext i32 %n to i64 + br label %2 + +; <label>:2 ; preds = %2, %0 + %indvars.iv = phi i64 [ %indvars.iv.next, %2 ], [ %1, %0 ] + %3 = getelementptr inbounds double, double* %A, i64 %indvars.iv + %4 = load double, double* %3, align 8 + %5 = fadd double %4, 3.000000e+00 + %6 = fmul double %4, 2.000000e+00 + %7 = fadd double %5, %6 + %8 = fadd double %7, 2.000000e+00 + %9 = fmul double %8, 5.000000e-01 + %10 = fadd double %6, %9 + %11 = fsub double %10, %5 + %12 = fadd double %4, %11 + %13 = fdiv double %8, %12 + %14 = fmul double %13, %8 + %15 = fmul double %6, %14 + %16 = fmul double %5, %15 + %17 = fadd double %16, -3.000000e+00 + %18 = fsub double %4, %5 + %19 = fadd double %6, %18 + %20 = fadd double %13, %19 + %21 = fadd double %20, %17 + %22 = fadd double %21, 3.000000e+00 + %23 = fmul double %4, %22 + store double %23, double* %3, align 8 + %indvars.iv.next = add i64 %indvars.iv, -1 + %24 = trunc i64 %indvars.iv to i32 + %25 = icmp eq i32 %24, 0 + br i1 %25, label %26, label %2 + +; <label>:26 ; preds = %2 + ret void +} + +; This is a small loop. Unroll it twice. +;CHECK: small_loop +;CHECK: xor +;CHECK: xor +;CHECK: ret +define void @small_loop(i16* nocapture %A, i64 %n) nounwind uwtable ssp { + %1 = icmp eq i64 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %i.01 = phi i64 [ %5, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i16, i16* %A, i64 %i.01 + %3 = load i16, i16* %2, align 2 + %4 = xor i16 %3, 3 + store i16 %4, i16* %2, align 2 + %5 = add i64 %i.01, 1 + %exitcond = icmp eq i64 %5, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll new file mode 100644 index 00000000000..6f8f5223ce4 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll @@ -0,0 +1,632 @@ +; RUN: opt < %s -vector-library=Accelerate -loop-vectorize -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;CHECK-LABEL: @sqrt_f32( +;CHECK: vsqrtf{{.*}}<4 x float> +;CHECK: ret void +declare float @sqrtf(float) nounwind readnone +define void @sqrt_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @sqrtf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @exp_f32( +;CHECK: vexpf{{.*}}<4 x float> +;CHECK: ret void +declare float @expf(float) nounwind readnone +define 
void @exp_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @expf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+;CHECK-LABEL: @log_f32(
+;CHECK: vlogf{{.*}}<4 x float>
+;CHECK: ret void
+declare float @logf(float) nounwind readnone
+define void @log_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @logf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; For the abs instruction we'll generate a vector intrinsic, as it's cheaper than a lib call.
+;CHECK-LABEL: @fabs_f32(
+;CHECK: fabs{{.*}}<4 x float>
+;CHECK: ret void
+declare float @fabsf(float) nounwind readnone
+define void @fabs_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %call = tail call float @fabsf(float %0) nounwind readnone
+ %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv
+ store float %call, float* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Test that we can vectorize an intrinsic into a vector call.
+;CHECK-LABEL: @exp_f32_intrin( +;CHECK: vexpf{{.*}}<4 x float> +;CHECK: ret void +declare float @llvm.exp.f32(float) nounwind readnone +define void @exp_f32_intrin(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @llvm.exp.f32(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; Test that we don't vectorize arbitrary functions. +;CHECK-LABEL: @foo_f32( +;CHECK-NOT: foo{{.*}}<4 x float> +;CHECK: ret void +declare float @foo(float) nounwind readnone +define void @foo_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @foo(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +; Test that we don't vectorize calls with nobuiltin attribute. 
+;CHECK-LABEL: @sqrt_f32_nobuiltin( +;CHECK-NOT: vsqrtf{{.*}}<4 x float> +;CHECK: ret void +define void @sqrt_f32_nobuiltin(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @sqrtf(float %0) nounwind readnone nobuiltin + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @ceil_f32( +;CHECK: vceilf{{.*}}<4 x float> +;CHECK: ret void +declare float @ceilf(float) nounwind readnone +define void @ceil_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @ceilf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @floor_f32( +;CHECK: vfloorf{{.*}}<4 x float> +;CHECK: ret void +declare float @floorf(float) nounwind readnone +define void @floor_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @floorf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @expm1_f32( +;CHECK: vexpm1f{{.*}}<4 x float> +;CHECK: ret void +declare float @expm1f(float) nounwind readnone +define void @expm1_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @expm1f(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + 
%indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @log1p_f32( +;CHECK: vlog1pf{{.*}}<4 x float> +;CHECK: ret void +declare float @log1pf(float) nounwind readnone +define void @log1p_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @log1pf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @log10_f32( +;CHECK: vlog10f{{.*}}<4 x float> +;CHECK: ret void +declare float @log10f(float) nounwind readnone +define void @log10_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @log10f(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @logb_f32( +;CHECK: vlogbf{{.*}}<4 x float> +;CHECK: ret void +declare float @logbf(float) nounwind readnone +define void @logb_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @logbf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @sin_f32( +;CHECK: vsinf{{.*}}<4 x float> +;CHECK: ret void +declare float @sinf(float) nounwind readnone +define void @sin_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, 
float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @sinf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @cos_f32( +;CHECK: vcosf{{.*}}<4 x float> +;CHECK: ret void +declare float @cosf(float) nounwind readnone +define void @cos_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @cosf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @tan_f32( +;CHECK: vtanf{{.*}}<4 x float> +;CHECK: ret void +declare float @tanf(float) nounwind readnone +define void @tan_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @tanf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @asin_f32( +;CHECK: vasinf{{.*}}<4 x float> +;CHECK: ret void +declare float @asinf(float) nounwind readnone +define void @asin_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @asinf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @acos_f32( +;CHECK: vacosf{{.*}}<4 x float> +;CHECK: ret void +declare float @acosf(float) nounwind readnone +define void @acos_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { 
+entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @acosf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @atan_f32( +;CHECK: vatanf{{.*}}<4 x float> +;CHECK: ret void +declare float @atanf(float) nounwind readnone +define void @atan_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @atanf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @sinh_f32( +;CHECK: vsinhf{{.*}}<4 x float> +;CHECK: ret void +declare float @sinhf(float) nounwind readnone +define void @sinh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @sinhf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @cosh_f32( +;CHECK: vcoshf{{.*}}<4 x float> +;CHECK: ret void +declare float @coshf(float) nounwind readnone +define void @cosh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @coshf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = 
%for.body, %entry + ret void +} + +;CHECK-LABEL: @tanh_f32( +;CHECK: vtanhf{{.*}}<4 x float> +;CHECK: ret void +declare float @tanhf(float) nounwind readnone +define void @tanh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @tanhf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @asinh_f32( +;CHECK: vasinhf{{.*}}<4 x float> +;CHECK: ret void +declare float @asinhf(float) nounwind readnone +define void @asinh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @asinhf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @acosh_f32( +;CHECK: vacoshf{{.*}}<4 x float> +;CHECK: ret void +declare float @acoshf(float) nounwind readnone +define void @acosh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @acoshf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +;CHECK-LABEL: @atanh_f32( +;CHECK: vatanhf{{.*}}<4 x float> +;CHECK: ret void +declare float @atanhf(float) nounwind readnone +define void @atanh_f32(i32 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %call = tail call float @atanhf(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 
%indvars.iv + store float %call, float* %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.ll new file mode 100644 index 00000000000..90114672c8f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.ll @@ -0,0 +1,87 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: LV: Loop hints: force=enabled +; CHECK: LV: Loop hints: force=? +; No more loops in the module +; CHECK-NOT: LV: Loop hints: force= +; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 1 loop-vectorize - Number of loops vectorized + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; +; The source code for the test: +; +; #include <math.h> +; void foo(float* restrict A, float * restrict B) +; { +; for (int i = 0; i < 1000; i+=2) A[i] = sinf(B[i]); +; } +; + +; +; This loop will be vectorized even though the scalar cost is lower than any of the vector costs, because vectorization is explicitly forced in the metadata. +; + +define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11 + %call = tail call float @llvm.sin.f32(float %0) + %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4, !llvm.access.group !11 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1 + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +!1 = !{!1, !2, !{!"llvm.loop.parallel_accesses", !11}} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} +!11 = distinct !{} + +; +; This function will not be vectorized, as the scalar cost is lower than any of the vector costs (see the source-level sketch below).
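As a source-level sketch of this pair of functions (illustrative only; the pragma form is an assumption, not part of the test file), the two correspond to the same C loop with and without a forcing pragma:

    #include <math.h>

    /* @vectorized: the pragma makes clang attach llvm.loop.vectorize.enable
       metadata, overriding the cost model's verdict. */
    void vectorized(float *restrict A, float *restrict B) {
    #pragma clang loop vectorize(enable)
      for (int i = 0; i < 1000; i += 2)
        A[i] = sinf(B[i]);
    }

    /* @not_vectorized: no forcing hint, so the cost model's verdict stands. */
    void not_vectorized(float *restrict A, float *restrict B) {
      for (int i = 0; i < 1000; i += 2)
        A[i] = sinf(B[i]);
    }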
+; + +define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13 + %call = tail call float @llvm.sin.f32(float %0) + %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4, !llvm.access.group !13 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3 + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +declare float @llvm.sin.f32(float) nounwind readnone + +; Dummy metadata +!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}} +!13 = distinct !{} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll new file mode 100644 index 00000000000..7c249a1b422 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -0,0 +1,217 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S -vectorizer-min-trip-count=21 | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux" + +; +; The source code for the test: +; +; void foo(float* restrict A, float* restrict B) +; { +; for (int i = 0; i < 20; ++i) A[i] += B[i]; +; } +; + +; +; This loop will be vectorized even though the trip count is below the threshold, because vectorization is explicitly forced in the metadata.
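Worked numbers behind the autogenerated checks that follow (derived from the VF of 8 that the CHECK lines encode; shown only as a reading aid):

    // trip count 20, VF 8:
    // vector iterations: 20 / 8 = 2  -> indices 0..15, so the vector-body exit compares INDEX_NEXT against 16
    // middle block     : 20 == 16 is false, so the scalar epilogue runs
    // scalar remainder : 20 % 8 = 4  -> the epilogue resumes at i = 16 and covers 16..19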
+; +define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +; CHECK-LABEL: @vectorized( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !4 +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11 + %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 
4, !llvm.access.group !11 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.access.group !11 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret void +} + +!1 = !{!1, !2, !{!"llvm.loop.parallel_accesses", !11}} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} +!11 = distinct !{} + +; +; This loop will be vectorized even though the trip count is below the threshold, since no +; scalar iterations are needed thanks to folding its tail. +; +define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +; CHECK-LABEL: @vectorized1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7 +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13 + %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.access.group !13 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3 + +for.end: + ret void +} + +!3
= !{!3, !{!"llvm.loop.parallel_accesses", !13}} +!13 = distinct !{} + +; +; This loop will be vectorized as the trip count is below the threshold but no +; scalar iterations are needed. +; +define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +; CHECK-LABEL: @vectorized2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11 +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %B, i64 
%indvars.iv + %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13 + %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.access.group !13 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4 + +for.end: + ret void +} + +!4 = !{!4} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll new file mode 100644 index 00000000000..5b3f5c58c79 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll @@ -0,0 +1,66 @@ +; RUN: opt < %s -loop-vectorize -mattr=+sse4.2 -debug-only=loop-vectorize 2>&1 -S | FileCheck %s +; REQUIRES: asserts +; Make sure we use the right select kind when querying select costs. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 + +; CHECK: Checking a loop in "scalarselect" +define void @scalarselect(i1 %cond) { + br label %1 + +; <label>:1 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv + +; A scalar select has a cost of 1 on core2 +; CHECK: cost of 1 for VF 2 {{.*}} select i1 %cond, i32 %6, i32 0 + + %sel = select i1 %cond, i32 %6, i32 zeroinitializer + store i32 %sel, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 + ret void +} + +; CHECK: Checking a loop in "vectorselect" +define void @vectorselect(i1 %cond) { + br label %1 + +; <label>:1 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32, i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32, i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv + %8 = icmp ult i64 %indvars.iv, 8 + +; A vector select has a cost of 1 on core2 +; CHECK: cost of 1 for VF 2 {{.*}} select i1 %8, i32 %6, i32 0 + + %sel = select i1 %8, i32 %6, i32 zeroinitializer + store i32 %sel, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %9, label %1 + +; <label>:9 + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll new file mode 100644 index 00000000000..2d2082eec3a --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll @@ -0,0 +1,75 @@ +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX1 +; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=core-avx2 -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX2 +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = global [1000 x i8] zeroinitializer, align 16 +@b = global [1000 x i8] zeroinitializer, align 16 +@c = global [1000 x i8] zeroinitializer, align 16 +@u = global [1000 x i32] zeroinitializer, align 16 +@v = global [1000 x i32] zeroinitializer, align 16 +@w = global [1000 x i32] zeroinitializer, align 16 + +; Tests that the vectorization factor is determined by the smallest type in +; the loop instead of the widest one, for maximum bandwidth, when +; -vectorizer-maximize-bandwidth is specified. +; +; CHECK-LABEL: foo +; CHECK-AVX1: LV: Selecting VF: 16. +; CHECK-AVX2: LV: Selecting VF: 32. +define void @foo() { +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 %indvars.iv + store i8 %add, i8* %arrayidx6, align 1 + %arrayidx8 = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 %indvars.iv + %2 = load i32, i32* %arrayidx8, align 4 + %arrayidx10 = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 %indvars.iv + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx13 = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 %indvars.iv + store i32 %add11, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; We should not choose a VF larger than the constant trip count (TC). +; The chosen VF should be at most 16 (not the maximum possible vector width of 32 for AVX2). +define void @not_too_small_tc(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: not_too_small_tc +; CHECK-AVX2: LV: Selecting VF: 16.
+entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %B, i64 %indvars.iv + %l1 = load i8, i8* %arrayidx, align 4, !llvm.access.group !13 + %arrayidx2 = getelementptr inbounds i8, i8* %A, i64 %indvars.iv + %l2 = load i8, i8* %arrayidx2, align 4, !llvm.access.group !13 + %add = add i8 %l1, %l2 + store i8 %add, i8* %arrayidx2, align 4, !llvm.access.group !13 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4 + +for.end: + ret void +} +!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}} +!4 = !{!4} +!13 = distinct !{} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll new file mode 100644 index 00000000000..cca829b9457 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -0,0 +1,150 @@ +; RUN: opt -basicaa -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%0 = type { %0*, %1 } +%1 = type { i8*, i32 } + +@p = global [2048 x [8 x i32*]] zeroinitializer, align 16 +@q = global [2048 x i16] zeroinitializer, align 16 +@r = global [2048 x i16] zeroinitializer, align 16 + +; Tests for widest type +; Ensure that we count the pointer store in the first test case: it is a +; consecutive store of pointers, so it should count towards the +; widest type computation. +; +; CHECK: test_consecutive_store +; CHECK: The Smallest and Widest types: 64 / 64 bits. +define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 { + %4 = load %0*, %0** %2, align 8 + %5 = icmp eq %0** %0, %1 + br i1 %5, label %12, label %6 + +; <label>:6 ; preds = %3 + br label %7 + +; <label>:7 ; preds = %7, %6 + %8 = phi %0** [ %0, %6 ], [ %9, %7 ] + store %0* %4, %0** %8, align 8 + %9 = getelementptr inbounds %0*, %0** %8, i64 1 + %10 = icmp eq %0** %9, %1 + br i1 %10, label %11, label %7 + +; <label>:11 ; preds = %7 + br label %12 + +; <label>:12 ; preds = %11, %3 + ret void +} + +; However, if a set of pointers is not stored to consecutive memory, we do +; NOT count the store towards the widest vector type. +; In the test case below, pointers computed from i16 loads are stored +; non-consecutively into an array of pointers, so the widest type should be i16. +; int* p[2048][8]; +; short q[2048]; +; for (int y = 0; y < 8; ++y) +; for (int i = 0; i < 1024; ++i) { +; p[i][y] = (int*) (1 + q[i]); +; } +; CHECK: test_nonconsecutive_store +; CHECK: The Smallest and Widest types: 16 / 16 bits.
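A quick stride check behind that claim (a sketch using the array shapes from the C snippet above and the 64-bit pointers of the datalayout):

    // int* p[2048][8]; short q[2048];
    // &p[i+1][y] - &p[i][y] = 8 * sizeof(int*) = 64 bytes -> strided pointer stores, not counted
    // &q[i+1]    - &q[i]    = sizeof(short)    = 2 bytes  -> consecutive i16 loads, counted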
+define void @test_nonconsecutive_store() nounwind ssp uwtable { + br label %1 + +; <label>:1 ; preds = %14, %0 + %2 = phi i64 [ 0, %0 ], [ %15, %14 ] + br label %3 + +; <label>:3 ; preds = %3, %1 + %4 = phi i64 [ 0, %1 ], [ %11, %3 ] + %5 = getelementptr inbounds [2048 x i16], [2048 x i16]* @q, i64 0, i64 %4 + %6 = load i16, i16* %5, align 2 + %7 = sext i16 %6 to i64 + %8 = add i64 %7, 1 + %9 = inttoptr i64 %8 to i32* + %10 = getelementptr inbounds [2048 x [8 x i32*]], [2048 x [8 x i32*]]* @p, i64 0, i64 %4, i64 %2 + store i32* %9, i32** %10, align 8 + %11 = add i64 %4, 1 + %12 = trunc i64 %11 to i32 + %13 = icmp ne i32 %12, 1024 + br i1 %13, label %3, label %14 + +; <label>:14 ; preds = %3 + %15 = add i64 %2, 1 + %16 = trunc i64 %15 to i32 + %17 = icmp ne i32 %16, 8 + br i1 %17, label %1, label %18 + +; <label>:18 ; preds = %14 + ret void +} + + +@ia = global [1024 x i32*] zeroinitializer, align 16 +@ib = global [1024 x i32] zeroinitializer, align 16 +@ic = global [1024 x i8] zeroinitializer, align 16 +@p2 = global [2048 x [8 x i32*]] zeroinitializer, align 16 +@q2 = global [2048 x i16] zeroinitializer, align 16 + +;; Now we check the same rules for loads. We should take consecutive loads of +;; pointer types into account. +; CHECK: test_consecutive_ptr_load +; CHECK: The Smallest and Widest types: 8 / 64 bits. +define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { + br label %1 + +; <label>:1 ; preds = %1, %0 + %2 = phi i64 [ 0, %0 ], [ %10, %1 ] + %3 = phi i8 [ 0, %0 ], [ %9, %1 ] + %4 = getelementptr inbounds [1024 x i32*], [1024 x i32*]* @ia, i32 0, i64 %2 + %5 = load i32*, i32** %4, align 4 + %6 = ptrtoint i32* %5 to i64 + %7 = trunc i64 %6 to i8 + %8 = add i8 %3, 1 + %9 = add i8 %7, %8 + %10 = add i64 %2, 1 + %11 = icmp ne i64 %10, 1024 + br i1 %11, label %1, label %12 + +; <label>:12 ; preds = %1 + %13 = phi i8 [ %9, %1 ] + ret i8 %13 +} + +;; However, we should not take non-consecutive loads of pointers into account. +; CHECK: test_nonconsecutive_ptr_load +; CHECK: LV: The Smallest and Widest types: 16 / 16 bits.
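A source-level reconstruction of the loop nest below (hypothetical; inferred from the IR, with illustrative names): for a fixed y, the pointer loads from p2[i][y] are strided, so only the consecutive i16 stores into q2 feed the width computation:

    #include <stdint.h>

    int *p2[2048][8];
    short q2[2048];

    void test_nonconsecutive_ptr_load(void) {
      for (int y = 0; y < 8; ++y)
        for (int i = 0; i < 1024; ++i)
          /* strided load of a pointer, consecutive i16 store */
          q2[i] = (short)(intptr_t)p2[i][y];
    }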
+define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { + br label %1 + +; <label>:1 ; preds = %13, %0 + %2 = phi i64 [ 0, %0 ], [ %14, %13 ] + br label %3 + +; <label>:3 ; preds = %3, %1 + %4 = phi i64 [ 0, %1 ], [ %10, %3 ] + %5 = getelementptr inbounds [2048 x [8 x i32*]], [2048 x [8 x i32*]]* @p2, i64 0, i64 %4, i64 %2 + %6 = getelementptr inbounds [2048 x i16], [2048 x i16]* @q2, i64 0, i64 %4 + %7 = load i32*, i32** %5, align 2 + %8 = ptrtoint i32* %7 to i64 + %9 = trunc i64 %8 to i16 + store i16 %9, i16* %6, align 8 + %10 = add i64 %4, 1 + %11 = trunc i64 %10 to i32 + %12 = icmp ne i32 %11, 1024 + br i1 %12, label %3, label %13 + +; <label>:13 ; preds = %3 + %14 = add i64 %2, 1 + %15 = trunc i64 %14 to i32 + %16 = icmp ne i32 %15, 8 + br i1 %16, label %1, label %17 + +; <label>:17 ; preds = %13 + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll new file mode 100644 index 00000000000..c38b638ed2b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s + +; RUN: llc < %s -mtriple x86_64-pc-linux-gnu -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s +; DEBUG-OUTPUT-NOT: .loc +; DEBUG-OUTPUT-NOT: {{.*}}.debug_info + +; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1) +; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4) +; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define i32 @foo(i32 %n) #0 !dbg !4 { +entry: + %diff = alloca i32, align 4 + %cb = alloca [16 x i8], align 16 + %cc = alloca [16 x i8], align 16 + store i32 0, i32* %diff, align 4, !tbaa !11 + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* %cb, i64 0, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1, !tbaa !21 + %conv = sext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds [16 x i8], [16 x i8]* %cc, i64 0, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1, !tbaa !21 + %conv3 = sext i8 %1 to i32 + %sub = sub i32 %conv, %conv3 + %add = add nsw i32 %sub, %add8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !25 + +for.end: ; preds = %for.body + store i32 %add, i32* %diff, align 4, !tbaa !11 + call void @ibar(i32* %diff) #2 + ret i32 0 +} + +declare void @ibar(i32*) #1 + +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} +!llvm.dbg.cu = !{!24} + +!1 = !DIFile(filename: "vectorization-remarks.c", directory: ".") +!2 = !{} +!3 
= !{!4} +!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !24, scopeLine: 6, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "vectorization-remarks.c", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = !{i32 2, !"Dwarf Version", i32 4} +!8 = !{i32 1, !"Debug Info Version", i32 3} +!9 = !{!"clang version 3.5.0 "} +!10 = !DILocation(line: 8, column: 3, scope: !4) +!11 = !{!12, !12, i64 0} +!12 = !{!"int", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !DILocation(line: 17, column: 8, scope: !16) +!16 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !17) +!17 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !18) +!18 = distinct !DILexicalBlock(line: 17, column: 3, file: !1, scope: !4) +!19 = !DILocation(line: 18, column: 5, scope: !20) +!20 = distinct !DILexicalBlock(line: 17, column: 27, file: !1, scope: !18) +!21 = !{!13, !13, i64 0} +!22 = !DILocation(line: 20, column: 3, scope: !4) +!23 = !DILocation(line: 21, column: 3, scope: !4) +!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, emissionKind: NoDebug) +!25 = !{!25, !15} diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll new file mode 100644 index 00000000000..4aa96df94ff --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll @@ -0,0 +1,313 @@ +; RUN: opt < %s -loop-vectorize -transform-warning -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -transform-warning -o /dev/null -pass-remarks-output=%t.yaml +; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s + +; RUN: opt < %s -passes=loop-vectorize,transform-warning -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize,transform-warning -o /dev/null -pass-remarks-output=%t.yaml +; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s + +; C/C++ code for tests +; void test(int *A, int Length) { +; #pragma clang loop vectorize(enable) interleave(enable) +; for (int i = 0; i < Length; i++) { +; A[i] = i; +; if (A[i] > Length) +; break; +; } +; } +; File, line, and column should match those specified in the metadata +; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations +; CHECK: remark: source.cpp:4:5: loop not vectorized + +; void test_disabled(int *A, int Length) { +; #pragma clang loop vectorize(disable) interleave(disable) +; for (int i = 0; i < Length; i++) +; A[i] = i; +; } +; CHECK: remark: source.cpp:13:5: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized + +; void test_array_bounds(int *A, int *B, int Length) { +; #pragma clang loop vectorize(enable) +; for (int i = 0; i < Length; i++) +; A[i] = A[B[i]]; +; } +; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds +; CHECK: remark: source.cpp:19:5: loop not vectorized +; CHECK: warning: source.cpp:19:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering + +; int foo(); +; int test_multiple_failures(int *A) { +; int k = 0; +; #pragma
clang loop vectorize(enable) interleave(enable) +; for (int i = 0; i < 1000; i+=A[i]) { +; if (A[i]) +; k = foo(); +; } +; return k; +; } +; CHECK: remark: source.cpp:29:7: loop not vectorized: control flow cannot be substituted for a select +; CHECK: remark: source.cpp:27:3: loop not vectorized + +; YAML: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: CantComputeNumberOfIterations +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 4, Column: 5 } +; YAML-NEXT: Function: _Z4testPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: could not determine number of loop iterations +; YAML-NEXT: ... +; YAML-NEXT: --- !Missed +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: MissedDetails +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 4, Column: 5 } +; YAML-NEXT: Function: _Z4testPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: loop not vectorized +; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: AllDisabled +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 13, Column: 5 } +; YAML-NEXT: Function: _Z13test_disabledPii +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized' +; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: '' +; YAML-NEXT: Name: CantIdentifyArrayBounds +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: cannot identify array bounds +; YAML-NEXT: ... +; YAML-NEXT: --- !Missed +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: MissedDetails +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: loop not vectorized +; YAML-NEXT: - String: ' (Force=' +; YAML-NEXT: - Force: 'true' +; YAML-NEXT: - String: ')' +; YAML-NEXT: ... +; YAML-NEXT: --- !Failure +; YAML-NEXT: Pass: transform-warning +; YAML-NEXT: Name: FailedRequestedVectorization +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 19, Column: 5 } +; YAML-NEXT: Function: _Z17test_array_boundsPiS_i +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering' +; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: NoCFGForSelect +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 29, Column: 7 } +; YAML-NEXT: Function: test_multiple_failures +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: control flow cannot be substituted for a select +; YAML-NEXT: ... +; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: NonReductionValueUsedOutsideLoop +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 } +; YAML-NEXT: Function: test_multiple_failures +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: value that could not be identified as reduction is used outside the loop +; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: CantComputeNumberOfIterations +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 } +; YAML-NEXT: Function: test_multiple_failures +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'loop not vectorized: ' +; YAML-NEXT: - String: could not determine number of loop iterations +; YAML-NEXT: ... +; YAML-NEXT: --- !Missed +; YAML-NEXT: Pass: loop-vectorize +; YAML-NEXT: Name: MissedDetails +; YAML-NEXT: DebugLoc: { File: source.cpp, Line: 27, Column: 3 } +; YAML-NEXT: Function: test_multiple_failures +; YAML-NEXT: Args: +; YAML-NEXT: - String: loop not vectorized +; YAML-NEXT: ... + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Function Attrs: nounwind optsize ssp uwtable +define void @_Z4testPii(i32* nocapture %A, i32 %Length) #0 !dbg !4 { +entry: + %cmp10 = icmp sgt i32 %Length, 0, !dbg !12 + br i1 %cmp10, label %for.body, label %for.end, !dbg !12, !llvm.loop !14 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !16 + %0 = trunc i64 %indvars.iv to i32, !dbg !16 + %ld = load i32, i32* %arrayidx, align 4 + store i32 %0, i32* %arrayidx, align 4, !dbg !16, !tbaa !18 + %cmp3 = icmp sle i32 %ld, %Length, !dbg !22 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !12 + %1 = trunc i64 %indvars.iv.next to i32 + %cmp = icmp slt i32 %1, %Length, !dbg !12 + %or.cond = and i1 %cmp3, %cmp, !dbg !22 + br i1 %or.cond, label %for.body, label %for.end, !dbg !22 + +for.end: ; preds = %for.body, %entry + ret void, !dbg !24 +} + +; CHECK: _Z4testPii +; CHECK-NOT: x i32> +; CHECK: ret + +; Function Attrs: nounwind optsize ssp uwtable +define void @_Z13test_disabledPii(i32* nocapture %A, i32 %Length) #0 !dbg !7 { +entry: + %cmp4 = icmp sgt i32 %Length, 0, !dbg !25 + br i1 %cmp4, label %for.body, label %for.end, !dbg !25, !llvm.loop !27 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv, !dbg !30 + %0 = trunc i64 %indvars.iv to i32, !dbg !30 + store i32 %0, i32* %arrayidx, align 4, !dbg !30, !tbaa !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !25 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !25 + %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !25 + br i1 %exitcond, label %for.end, label %for.body, !dbg !25, !llvm.loop !27 + +for.end: ; preds = %for.body, %entry + ret void, !dbg !31 +} + +; CHECK: _Z13test_disabledPii +; CHECK-NOT: x i32> +; CHECK: ret + +; Function Attrs: nounwind optsize ssp uwtable +define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 !dbg !8 { +entry: + %cmp9 = icmp sgt i32 %Length, 0, !dbg !32 + br i1 %cmp9, label %for.body.preheader, label %for.end, !dbg !32, !llvm.loop !34 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !35 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv, !dbg !35 + %0 = load i32, i32* %arrayidx, align 4, !dbg !35, !tbaa !18 + %idxprom1 = sext i32 %0 to i64, !dbg !35 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1, !dbg !35 + %1 = load i32, i32* %arrayidx2, align 4, !dbg !35, !tbaa !18 + %arrayidx4 = getelementptr inbounds i32, i32* 
%A, i64 %indvars.iv, !dbg !35 + store i32 %1, i32* %arrayidx4, align 4, !dbg !35, !tbaa !18 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !32 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !32 + %exitcond = icmp eq i32 %lftr.wideiv, %Length, !dbg !32 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !32, !llvm.loop !34 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void, !dbg !36 +} + +; CHECK: _Z17test_array_boundsPiS_i +; CHECK-NOT: x i32> +; CHECK: ret + +; Function Attrs: nounwind uwtable +define i32 @test_multiple_failures(i32* nocapture readonly %A) #0 !dbg !46 { +entry: + br label %for.body, !dbg !38 + +for.body: ; preds = %entry, %for.inc + %i.09 = phi i32 [ 0, %entry ], [ %add, %for.inc ] + %k.09 = phi i32 [ 0, %entry ], [ %k.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.09, !dbg !40 + %0 = load i32, i32* %arrayidx, align 4, !dbg !40 + %tobool = icmp eq i32 %0, 0, !dbg !40 + br i1 %tobool, label %for.inc, label %if.then, !dbg !40 + +if.then: ; preds = %for.body + %call = tail call i32 (...) @foo(), !dbg !41 + %.pre = load i32, i32* %arrayidx, align 4 + br label %for.inc, !dbg !42 + +for.inc: ; preds = %for.body, %if.then + %1 = phi i32 [ %.pre, %if.then ], [ 0, %for.body ], !dbg !43 + %k.1 = phi i32 [ %call, %if.then ], [ %k.09, %for.body ] + %add = add nsw i32 %1, %i.09, !dbg !44 + %cmp = icmp slt i32 %add, 1000, !dbg !45 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !38 + +for.cond.cleanup: ; preds = %for.inc + ret i32 %k.1, !dbg !39 +} + +declare i32 @foo(...) + +; CHECK: test_multiple_failure +; CHECK-NOT: x i32> +; CHECK: ret + +attributes #0 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!9, !10} +!llvm.ident = !{!11} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0", isOptimized: true, runtimeVersion: 6, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "source.cpp", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "test", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "source.cpp", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = distinct !DISubprogram(name: "test_disabled", line: 10, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 10, file: !1, scope: !5, type: !6, retainedNodes: !2) +!8 = distinct !DISubprogram(name: "test_array_bounds", line: 16, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 16, file: !1, scope: !5, type: !6, retainedNodes: !2) +!9 = !{i32 2, !"Dwarf Version", i32 2} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{!"clang version 3.5.0"} +!12 = !DILocation(line: 3, column: 8, scope: !13) +!13 = distinct !DILexicalBlock(line: 3, column: 3, file: !1, scope: !4) +!14 = !{!14, !15, !15} +!15 = !{!"llvm.loop.vectorize.enable", i1 true} +!16 = !DILocation(line: 4, column: 5, scope: !17) +!17 = distinct !DILexicalBlock(line: 3, column: 36, file: !1, scope: !13) +!18 = !{!19, !19, i64 0} +!19 = !{!"int", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 5, column: 9, scope: !23) +!23 = distinct 
!DILexicalBlock(line: 5, column: 9, file: !1, scope: !17) +!24 = !DILocation(line: 8, column: 1, scope: !4) +!25 = !DILocation(line: 12, column: 8, scope: !26) +!26 = distinct !DILexicalBlock(line: 12, column: 3, file: !1, scope: !7) +!27 = !{!27, !28, !29} +!28 = !{!"llvm.loop.interleave.count", i32 1} +!29 = !{!"llvm.loop.vectorize.width", i32 1} +!30 = !DILocation(line: 13, column: 5, scope: !26) +!31 = !DILocation(line: 14, column: 1, scope: !7) +!32 = !DILocation(line: 18, column: 8, scope: !33) +!33 = distinct !DILexicalBlock(line: 18, column: 3, file: !1, scope: !8) +!34 = !{!34, !15} +!35 = !DILocation(line: 19, column: 5, scope: !33) +!36 = !DILocation(line: 20, column: 1, scope: !8) +!37 = distinct !DILexicalBlock(line: 24, column: 3, file: !1, scope: !46) +!38 = !DILocation(line: 27, column: 3, scope: !37) +!39 = !DILocation(line: 31, column: 3, scope: !37) +!40 = !DILocation(line: 28, column: 9, scope: !37) +!41 = !DILocation(line: 29, column: 11, scope: !37) +!42 = !DILocation(line: 29, column: 7, scope: !37) +!43 = !DILocation(line: 27, column: 32, scope: !37) +!44 = !DILocation(line: 27, column: 30, scope: !37) +!45 = !DILocation(line: 27, column: 21, scope: !37) +!46 = distinct !DISubprogram(name: "test_multiple_failures", line: 26, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 26, file: !1, scope: !5, type: !6, retainedNodes: !2) diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll new file mode 100644 index 00000000000..ac0648a6838 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll @@ -0,0 +1,112 @@ +; RUN: opt < %s -loop-vectorize -pass-remarks-missed='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s + +; Verify analysis remarks are generated when interleaving is not beneficial. +; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that vectorization is not beneficial +; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that interleaving is not beneficial and is explicitly disabled or interleave count is set to 1 +; CHECK: remark: vectorization-remarks-profitable.c:12:17: the cost-model indicates that vectorization is not beneficial +; CHECK: remark: vectorization-remarks-profitable.c:12:17: the cost-model indicates that interleaving is not beneficial + +; First loop. +; #pragma clang loop interleave(disable) unroll(disable) +; for(int i = 0; i < n; i++) { +; out[i] = *in[i]; +; } + +; Second loop. 
+; #pragma clang loop unroll(disable) +; for(int i = 0; i < n; i++) { +; out[i] = *in[i]; +; } + +; ModuleID = 'vectorization-remarks-profitable.ll' +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +; Function Attrs: nounwind uwtable +define void @do_not_interleave(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !4 { +entry: + %cmp.4 = icmp eq i32 %size, 0, !dbg !10 + br i1 %cmp.4, label %for.end, label %for.body.preheader, !dbg !11 + +for.body.preheader: ; preds = %entry + br label %for.body, !dbg !12 + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float*, float** %in, i64 %indvars.iv, !dbg !12 + %0 = bitcast float** %arrayidx to i32**, !dbg !12 + %1 = load i32*, i32** %0, align 8, !dbg !12 + %2 = load i32, i32* %1, align 4, !dbg !13 + %arrayidx2 = getelementptr inbounds float, float* %out, i64 %indvars.iv, !dbg !14 + %3 = bitcast float* %arrayidx2 to i32*, !dbg !15 + store i32 %2, i32* %3, align 4, !dbg !15 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !11 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !11 + %exitcond = icmp eq i32 %lftr.wideiv, %size, !dbg !11 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !dbg !11, !llvm.loop !16 + +for.end.loopexit: ; preds = %for.body + br label %for.end, !dbg !19 + +for.end: ; preds = %for.end.loopexit, %entry + ret void, !dbg !19 +} + +; Function Attrs: nounwind uwtable +define void @interleave_not_profitable(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !6 { +entry: + %cmp.4 = icmp eq i32 %size, 0, !dbg !20 + br i1 %cmp.4, label %for.end, label %for.body, !dbg !21 + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float*, float** %in, i64 %indvars.iv, !dbg !22 + %0 = bitcast float** %arrayidx to i32**, !dbg !22 + %1 = load i32*, i32** %0, align 8, !dbg !22 + %2 = load i32, i32* %1, align 4, !dbg !23 + %arrayidx2 = getelementptr inbounds float, float* %out, i64 %indvars.iv, !dbg !24 + %3 = bitcast float* %arrayidx2 to i32*, !dbg !25 + store i32 %2, i32* %3, align 4, !dbg !25 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !21 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !21 + %exitcond = icmp eq i32 %lftr.wideiv, %size, !dbg !21 + br i1 %exitcond, label %for.end, label %for.body, !dbg !21, !llvm.loop !26 + +for.end: ; preds = %for.body, %entry + ret void, !dbg !27 +} + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250016)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2) +!1 = !DIFile(filename: "vectorization-remarks-profitable.c", directory: "") +!2 = !{} +!4 = distinct !DISubprogram(name: "do_not_interleave", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: 
DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2) +!5 = !DISubroutineType(types: !2) +!6 = distinct !DISubprogram(name: "interleave_not_profitable", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, retainedNodes: !2) +!7 = !{i32 2, !"Dwarf Version", i32 4} +!8 = !{i32 2, !"Debug Info Version", i32 3} +!9 = !{!"clang version 3.8.0 (trunk 250016)"} +!10 = !DILocation(line: 4, column: 23, scope: !4) +!11 = !DILocation(line: 4, column: 3, scope: !4) +!12 = !DILocation(line: 5, column: 17, scope: !4) +!13 = !DILocation(line: 5, column: 16, scope: !4) +!14 = !DILocation(line: 5, column: 7, scope: !4) +!15 = !DILocation(line: 5, column: 14, scope: !4) +!16 = distinct !{!16, !17, !18} +!17 = !{!"llvm.loop.interleave.count", i32 1} +!18 = !{!"llvm.loop.unroll.disable"} +!19 = !DILocation(line: 6, column: 1, scope: !4) +!20 = !DILocation(line: 11, column: 23, scope: !6) +!21 = !DILocation(line: 11, column: 3, scope: !6) +!22 = !DILocation(line: 12, column: 17, scope: !6) +!23 = !DILocation(line: 12, column: 16, scope: !6) +!24 = !DILocation(line: 12, column: 7, scope: !6) +!25 = !DILocation(line: 12, column: 14, scope: !6) +!26 = distinct !{!26, !18} +!27 = !DILocation(line: 13, column: 1, scope: !6) + diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll new file mode 100644 index 00000000000..0345a3ba045 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll @@ -0,0 +1,73 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s +; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s + +; RUN: llc < %s -mtriple x86_64-pc-linux-gnu -o - | FileCheck -check-prefix=DEBUG-OUTPUT %s +; DEBUG-OUTPUT-NOT: .loc +; DEBUG-OUTPUT-NOT: {{.*}}.debug_info + +; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1) +; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4) +; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define i32 @foo(i32 %n) #0 !dbg !4 { +entry: + %diff = alloca i32, align 4 + %cb = alloca [16 x i8], align 16 + %cc = alloca [16 x i8], align 16 + store i32 0, i32* %diff, align 4, !dbg !10, !tbaa !11 + br label %for.body, !dbg !15 + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %add8 = phi i32 [ 0, %entry ], [ %add, %for.body ], !dbg !19 + %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* %cb, i64 0, i64 %indvars.iv, !dbg !19 + %0 = load i8, i8* %arrayidx, align 1, !dbg !19, !tbaa !21 + %conv = sext i8 %0 to i32, !dbg !19 + %arrayidx2 = getelementptr inbounds [16 x i8], [16 x i8]* %cc, i64 0, i64 %indvars.iv, !dbg !19 + %1 = load i8, i8* %arrayidx2, align 1, !dbg !19, !tbaa !21 + %conv3 = sext i8 %1 to i32, !dbg !19 + %sub = 
sub i32 %conv, %conv3, !dbg !19 + %add = add nsw i32 %sub, %add8, !dbg !19 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !15 + %exitcond = icmp eq i64 %indvars.iv.next, 16, !dbg !15 + br i1 %exitcond, label %for.end, label %for.body, !dbg !15 + +for.end: ; preds = %for.body + store i32 %add, i32* %diff, align 4, !dbg !19, !tbaa !11 + call void @ibar(i32* %diff) #2, !dbg !22 + ret i32 0, !dbg !23 +} + +declare void @ibar(i32*) #1 + +!llvm.module.flags = !{!7, !8} +!llvm.ident = !{!9} +!llvm.dbg.cu = !{!24} + +!1 = !DIFile(filename: "vectorization-remarks.c", directory: ".") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "foo", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !24, scopeLine: 6, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "vectorization-remarks.c", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = !{i32 2, !"Dwarf Version", i32 4} +!8 = !{i32 1, !"Debug Info Version", i32 3} +!9 = !{!"clang version 3.5.0 "} +!10 = !DILocation(line: 8, column: 3, scope: !4) +!11 = !{!12, !12, i64 0} +!12 = !{!"int", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !DILocation(line: 17, column: 8, scope: !16) +!16 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !17) +!17 = distinct !DILexicalBlock(line: 17, column: 8, file: !1, scope: !18) +!18 = distinct !DILexicalBlock(line: 17, column: 3, file: !1, scope: !4) +!19 = !DILocation(line: 18, column: 5, scope: !20) +!20 = distinct !DILexicalBlock(line: 17, column: 27, file: !1, scope: !18) +!21 = !{!13, !13, i64 0} +!22 = !DILocation(line: 20, column: 3, scope: !4) +!23 = !DILocation(line: 21, column: 3, scope: !4) +!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !1, emissionKind: NoDebug) diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll new file mode 100644 index 00000000000..d1473552c98 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-only-for-real.ll @@ -0,0 +1,39 @@ +; RUN: opt -S -basicaa -loop-vectorize < %s | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.11.0" + +define i32 @accum(i32* nocapture readonly %x, i32 %N) #0 { +entry: +; CHECK-LABEL: @accum +; CHECK-NOT: x i32> + + %cmp1 = icmp sgt i32 %N, 0 + br i1 %cmp1, label %for.inc.preheader, label %for.end + +for.inc.preheader: + br label %for.inc + +for.inc: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %for.inc.preheader ] + %sum.02 = phi i32 [ %add, %for.inc ], [ 0, %for.inc.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.02 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.inc + +for.end.loopexit: + %add.lcssa = phi i32 [ %add, %for.inc ] + br label %for.end + +for.end: + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ] + ret i32 %sum.0.lcssa + +; CHECK: ret i32 +} + +attributes #0 = { "target-cpu"="core2" "target-features"="+sse,-avx,-avx2,-sse2" } + diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll new 
file mode 100644
index 00000000000..0b23d6286f9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -0,0 +1,826 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=DISABLED_MASKED_STRIDED
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -simplifycfg -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses < %s | FileCheck %s -check-prefix=ENABLED_MASKED_STRIDED
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; When masked-interleaved-groups are disabled:
+; Check that the predicated load is not vectorized as an
+; interleaved-group but rather as scalarized accesses.
+; (For SKX, Gather is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
+; In this case a scalar epilogue is not needed.
+;
+; When masked-interleave-group is enabled we expect to find the proper mask
+; shuffling code, feeding the wide masked load for an interleave-group (with
+; a single member).
+; Since the last (second) member of the load-group is a gap, peeling is used,
+; so we also expect to find a scalar epilogue loop.
+;
+; void masked_strided1(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;       char t = p[2*ix];
+;       q[ix] = t;
+;     }
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided1(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +;DISABLED_MASKED_STRIDED-NOT: for.body: +;DISABLED_MASKED_STRIDED: for.end: + +;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1( +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32 +;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> +;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: %[[STRIDEDVEC:.+]] = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +;ENABLED_MASKED_STRIDED: for.body: + +define dso_local void @masked_strided1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.09, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.09, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09 + store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.09, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +; Exactly the same scenario except we are now optimizing for size, therefore +; we check that no scalar epilogue is created. Since we can't create an epilog +; we need the ability to mask out the gaps. +; When enable-masked-interleaved-access is enabled, the interleave-groups will +; be vectorized with masked wide-loads with the mask properly shuffled and +; And-ed with the gaps mask. 
+ +;ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize( +;ENABLED_MASKED_STRIDED-NEXT: entry: +;ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +;ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +;ENABLED_MASKED_STRIDED: vector.body: +;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +;ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP0]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> +;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> +;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) +;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +;ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* +;ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP0]]) +;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +;ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> +;ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 +;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]] +;ENABLED_MASKED_STRIDED-NOT: for.body: +;ENABLED_MASKED_STRIDED: for.end: +;ENABLED_MASKED_STRIDED-NEXT: ret void + + +define dso_local void @masked_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.09, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.09, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09 + store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.09, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + 
+for.end:
+  ret void
+}
+
+
+; Accesses with gaps under Optsize scenario again, with unknown trip-count
+; this time, in order to check the behavior of folding-the-tail (folding the
+; remainder loop into the main loop using masking) together with interleaved-
+; groups.
+; When masked-interleave-group is disabled the interleave-groups will be
+; invalidated during Legality checks, so there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The mask itself is an And of two masks: one that masks away the remainder
+; iterations, and one that masks away the 'else' of the 'if' statement.
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; void masked_strided1_optsize_unknown_tc(const unsigned char* restrict p,
+;                                         unsigned char* restrict q,
+;                                         unsigned char guard,
+;                                         int n) {
+;   for(ix=0; ix < n; ++ix) {
+;     if (ix > guard) {
+;       char t = p[2*ix];
+;       q[ix] = t;
+;     }
+;   }
+; }
+
+; DISABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED: vector.body:
+; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP0]], [[TMP2]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP3]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED: pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP5]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP6]], align 1
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> undef, i8 [[TMP7]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT: for.body:
+; DISABLED_MASKED_STRIDED: for.end:
+; DISABLED_MASKED_STRIDED-NEXT: ret void
+
+
+; ENABLED_MASKED_STRIDED-LABEL: @masked_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT: entry:
+; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32
+; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT:
[[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[TMP6]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]] +; ENABLED_MASKED_STRIDED-NOT: for.body: +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void + +define dso_local void @masked_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.010, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.010, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010 + 
store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.010, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Same, with stride 3. This is to check the gaps-mask and the shuffled mask +; with a different stride. +; So accesses are with gaps under Optsize scenario again, with unknown trip- +; count, in order to check the behavior of folding-the-tail (folding the +; remainder loop into the main loop using masking) together with interleaved- +; groups. +; When masked-interleave-group is enabled we check that there is no epilogue, +; and that the interleave-groups are vectorized using proper masking (with +; shuffling of the mask feeding the wide masked load/store). +; The mask itself is an And of two masks: one that masks away the remainder +; iterations, and one that masks away the 'else' of the 'if' statement. +; The shuffled mask is also And-ed with the gaps mask. +; +; void masked_strided3_optsize_unknown_tc(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard, +; int n) { +; for(ix=0; ix < n; ++ix) { +; if (ix > guard) { +; char t = p[3*ix]; +; q[ix] = t; +; } +; } +; } + + +; ENABLED_MASKED_STRIDED-LABEL: @masked_strided3_optsize_unknown_tc( +; ENABLED_MASKED_STRIDED-NEXT: entry: +; ENABLED_MASKED_STRIDED-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]] +; ENABLED_MASKED_STRIDED: vector.ph: +; ENABLED_MASKED_STRIDED-NEXT: [[CONV:%.*]] = zext i8 [[GUARD:%.*]] to i32 +; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[CONV]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = mul nsw i32 [[INDEX]], 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <24 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, 
i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0v24i8(<24 x i8>* [[TMP5]], i32 1, <24 x i1> [[TMP6]], <24 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP8]], i32 1, <8 x i1> [[TMP4]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]] +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void +; +define dso_local void @masked_strided3_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard, i32 %n) local_unnamed_addr optsize { +entry: + %cmp9 = icmp sgt i32 %n, 0 + br i1 %cmp9, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.010, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = mul nsw i32 %ix.010, 3 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.010 + store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.010, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + +; Back to stride 2 with gaps with a known trip count under opt for size, +; but this time the load/store are not predicated. +; When enable-masked-interleaved-access is disabled, the interleave-groups will +; be invalidated during cost-model checks because we have gaps and we can't +; create an epilog. The access is thus scalarized. +; (Before the fix that this test checks, we used to create an epilogue despite +; optsize, and vectorized the access as an interleaved-group. This is now fixed, +; and we make sure that a scalar epilogue does not exist). +; When enable-masked-interleaved-access is enabled, the interleave-groups will +; be vectorized with masked wide-loads (masking away the gaps). 
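+;
+; Illustration only (not checked by FileCheck): a rough C model of one vector
+; iteration of the masked wide load for VF=8, stride 2, single member. The
+; names here (wide_load_model, wide, lane) are illustrative, not from the test:
+;
+;   void wide_load_model(const unsigned char *p, unsigned char *q, int ix) {
+;     unsigned char wide[16];
+;     for (int lane = 0; lane < 16; ++lane)
+;       if (lane % 2 == 0)               /* gaps mask: <1,0,1,0,...> */
+;         wide[lane] = p[2*ix + lane];   /* one wide masked load */
+;     for (int k = 0; k < 8; ++k)        /* strided shuffle: even lanes */
+;       q[ix + k] = wide[2*k];
+;   }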
+;
+; void unconditional_strided1_optsize(const unsigned char* restrict p,
+;                                     unsigned char* restrict q,
+;                                     unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;     char t = p[2*ix];
+;     q[ix] = t;
+;   }
+; }
+
+;DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.v16i8.p0v16i8
+;DISABLED_MASKED_STRIDED: %{{.*}} = extractelement <8 x i32> %{{.*}}, i32 0
+;DISABLED_MASKED_STRIDED-NOT: for.body:
+;DISABLED_MASKED_STRIDED: for.end:
+
+;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
+;ENABLED_MASKED_STRIDED-NEXT: entry:
+;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP2]], i32 1, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]]
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
+;ENABLED_MASKED_STRIDED-NEXT: store <8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP4]], align 1
+;ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+;ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+;ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]]
+;ENABLED_MASKED_STRIDED-NOT: for.body:
+;ENABLED_MASKED_STRIDED: for.end:
+;ENABLED_MASKED_STRIDED-NEXT: ret void
+
+
+define dso_local void @unconditional_strided1_optsize(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr optsize {
+entry:
+  br label %for.body
+
+for.body:
+  %ix.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %mul = shl nuw nsw i32 %ix.06, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.06
+  store i8 %0, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i32 %ix.06, 1
+  %exitcond = icmp eq i32 %inc, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+
+
+; Unconditional accesses with gaps under Optsize scenario again, with unknown
+; trip-count this time, in order to check the behavior of folding-the-tail
+; (folding the remainder loop into the main loop using masking) together with
+; interleaved-groups. Folding-the-tail turns the accesses conditional, which
+; requires proper masking. In addition we need to mask out the gaps (all
+; because we are not allowed to use an epilogue due to optsize).
+; When enable-masked-interleaved-access is disabled, the interleave-groups will
+; be invalidated during cost-model checks. So there we check for no epilogue
+; and for scalarized conditional accesses.
+; When masked-interleave-group is enabled we check that there is no epilogue,
+; and that the interleave-groups are vectorized using proper masking (with
+; shuffling of the mask feeding the wide masked load/store).
+; The shuffled mask is also And-ed with the gaps mask.
+;
+; for(ix=0; ix < n; ++ix) {
+;   char t = p[2*ix];
+;   q[ix] = t;
+; }
+
+; DISABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; DISABLED_MASKED_STRIDED: vector.body:
+; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32
+; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}}
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; DISABLED_MASKED_STRIDED: pred.load.if:
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]]
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
+; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0
+; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; DISABLED_MASKED_STRIDED-NOT: for.body:
+; DISABLED_MASKED_STRIDED: for.end:
+; DISABLED_MASKED_STRIDED-NEXT: ret void
+
+; ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize_unknown_tc(
+; ENABLED_MASKED_STRIDED-NEXT: entry:
+; ENABLED_MASKED_STRIDED-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
+; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[N]], -1
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> undef, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
+; ENABLED_MASKED_STRIDED: vector.body:
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0
+; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> [[INDUCTION]], [[BROADCAST_SPLAT2]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to
<16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <16 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[TMP4]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[INDEX]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <8 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8>* [[TMP6]], i32 1, <8 x i1> [[TMP2]]) +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP7]], label [[FOR_END]], label [[VECTOR_BODY]] +; ENABLED_MASKED_STRIDED-NOT: for.body: +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void + +define dso_local void @unconditional_strided1_optsize_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %ix.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %mul = shl nuw nsw i32 %ix.07, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8* %q, i32 %ix.07 + store i8 %0, i8* %arrayidx1, align 1 + %inc = add nuw nsw i32 %ix.07, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + +; Check also a scenario with full interleave-groups (no gaps) as well as both +; load and store groups. We check that when masked-interleave-group is disabled +; the predicated loads (and stores) are not vectorized as an +; interleaved-group but rather as four separate scalarized accesses. +; (For SKX, gather/scatter is not supported by the compiler for chars, therefore +; the only remaining alternative is to scalarize). +; When masked-interleave-group is enabled we expect to find the proper mask +; shuffling code, feeding the wide masked load/store for the two interleave- +; groups. 
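+;
+; Illustration only (not checked by FileCheck): for an interleave factor of 2,
+; each of the 8 predicate bits guards both members of its group, which is what
+; the <i32 0, i32 0, i32 1, i32 1, ...> shuffle of the mask expresses. A rough
+; C model, with illustrative names:
+;
+;   void replicate_mask(const _Bool mask8[8], _Bool mask16[16]) {
+;     for (int lane = 0; lane < 8; ++lane) {
+;       mask16[2*lane]     = mask8[lane];  /* member 0: p[2*ix]   */
+;       mask16[2*lane + 1] = mask8[lane];  /* member 1: p[2*ix+1] */
+;     }
+;   }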
+;
+; void masked_strided2(const unsigned char* restrict p,
+;                      unsigned char* restrict q,
+;                      unsigned char guard) {
+;   for(ix=0; ix < 1024; ++ix) {
+;     if (ix > guard) {
+;       char left = p[2*ix];
+;       char right = p[2*ix + 1];
+;       char max = max(left, right);
+;       q[2*ix] = max;
+;       q[2*ix+1] = 0 - max;
+;     }
+;   }
+;}

+;DISABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;DISABLED_MASKED_STRIDED: vector.body:
+;DISABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;DISABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;DISABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;DISABLED_MASKED_STRIDED-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+;DISABLED_MASKED_STRIDED-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0
+;DISABLED_MASKED_STRIDED-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue
+;DISABLED_MASKED_STRIDED-NOT: %interleaved.mask =
+;DISABLED_MASKED_STRIDED-NOT: call <16 x i8> @llvm.masked.load.
+;DISABLED_MASKED_STRIDED-NOT: call void @llvm.masked.store.
+;DISABLED_MASKED_STRIDED-NOT: %{{.*}} = shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>

+;ENABLED_MASKED_STRIDED-LABEL: @masked_strided2(
+;ENABLED_MASKED_STRIDED: vector.body:
+;ENABLED_MASKED_STRIDED-NEXT: %index = phi i32
+;ENABLED_MASKED_STRIDED-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;ENABLED_MASKED_STRIDED: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}}
+;ENABLED_MASKED_STRIDED: %interleaved.mask = shufflevector <8 x i1> %[[VMASK]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
+;ENABLED_MASKED_STRIDED-NEXT: %[[WIDEMASKEDLOAD:.+]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask, <16 x i8> undef)
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;ENABLED_MASKED_STRIDED-NEXT: %{{.*}} = shufflevector <16 x i8> %[[WIDEMASKEDLOAD]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;ENABLED_MASKED_STRIDED: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %interleaved.mask)

+; Function Attrs: norecurse nounwind
+define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr {
+entry:
+  %conv = zext i8 %guard to i32
+  br label %for.body
+
+for.body:
+  %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp1 = icmp ugt i32 %ix.024, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %mul = shl nuw nsw i32 %ix.024, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx4 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx4, align 1
+ %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx6 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx6, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx11 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 %sub, i8* %arrayidx11, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +; Full groups again, this time checking an Optsize scenario, with unknown trip- +; count, to check the behavior of folding-the-tail (folding the remainder loop +; into the main loop using masking) together with interleaved-groups. +; When masked-interleave-group is disabled the interleave-groups will be +; invalidated during Legality check, so nothing to check here. +; When masked-interleave-group is enabled we check that there is no epilogue, +; and that the interleave-groups are vectorized using proper masking (with +; shuffling of the mask feeding the wide masked load/store). +; The mask itself is an And of two masks: one that masks away the remainder +; iterations, and one that masks away the 'else' of the 'if' statement. +; +; void masked_strided2_unknown_tc(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard, +; int n) { +; for(ix=0; ix < n; ++ix) { +; if (ix > guard) { +; char left = p[2*ix]; +; char right = p[2*ix + 1]; +; char max = max(left, right); +; q[2*ix] = max; +; q[2*ix+1] = 0 - max; +; } +; } +;} + +; ENABLED_MASKED_STRIDED-LABEL: @masked_strided2_unknown_tc( +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i32> [[VEC_IND]], {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP1]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = and <8 x i1> [[TMP0]], [[TMP3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = or i32 [[TMP1]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]] +; 
ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> +; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]]) +; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8 +; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add <8 x i32> {{.*}}, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = icmp eq i32 {{.*}}, {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP13]], +; ENABLED_MASKED_STRIDED: for.end: +; ENABLED_MASKED_STRIDED-NEXT: ret void + +define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %guard, i32 %n) local_unnamed_addr optsize { +entry: + %cmp22 = icmp sgt i32 %n, 0 + br i1 %cmp22, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %ix.023 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %cmp1 = icmp sgt i32 %ix.023, %guard + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.023, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %add = or i32 %mul, 1 + %arrayidx3 = getelementptr inbounds i8, i8* %p, i32 %add + %1 = load i8, i8* %arrayidx3, align 1 + %cmp.i = icmp slt i8 %0, %1 + %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0 + %arrayidx5 = getelementptr inbounds i8, i8* %q, i32 %mul + store i8 %spec.select.i, i8* %arrayidx5, align 1 + %sub = sub i8 0, %spec.select.i + %arrayidx9 = getelementptr inbounds i8, i8* %q, i32 %add + store i8 %sub, i8* %arrayidx9, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.023, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +; Full groups under Optsize scenario again, with unknown trip-count, again in +; order to check the behavior of folding-the-tail (folding the remainder loop +; into the main loop using masking) together with interleaved-groups. +; This time the accesses are not conditional, they become conditional only +; due to tail folding. +; When masked-interleave-group is disabled the interleave-groups will be +; invalidated during cost-model checks, so we check for no epilogue and +; scalarized conditional accesses. +; When masked-interleave-group is enabled we check for no epilogue, +; and interleave-groups vectorized using proper masking (with +; shuffling of the mask feeding the wide masked load/store). +; (Same vectorization scheme as for the previous loop with conditional accesses +; except here the mask only masks away the remainder iterations.) 
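+;
+; Illustration only (not checked by FileCheck): with tail folding and no 'if'
+; in the source, the per-lane predicate reduces to the trip-count check,
+; roughly as below (names are illustrative):
+;
+;   void remainder_mask(_Bool mask8[8], int index, int n) {
+;     for (int lane = 0; lane < 8; ++lane)
+;       mask8[lane] = (index + lane) <= (n - 1);  /* icmp ule vs. splat of n-1 */
+;   }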
+; +; void unconditional_masked_strided2_unknown_tc(const unsigned char* restrict p, +; unsigned char* restrict q, +; int n) { +; for(ix=0; ix < n; ++ix) { +; char left = p[2*ix]; +; char right = p[2*ix + 1]; +; char max = max(left, right); +; q[2*ix] = max; +; q[2*ix+1] = 0 - max; +; } +;} + +; DISABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc( +; DISABLED_MASKED_STRIDED: vector.body: +; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 +; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i32> [[VEC_IND]], {{.*}} +; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; DISABLED_MASKED_STRIDED: pred.load.if: +; DISABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP0]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP3]] +; DISABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1 +; DISABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> undef, i8 [[TMP5]], i32 0 +; DISABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; DISABLED_MASKED_STRIDED-NOT: for.body: +; DISABLED_MASKED_STRIDED: for.end: +; DISABLED_MASKED_STRIDED-NEXT: ret void + + + +; ENABLED_MASKED_STRIDED-LABEL: @unconditional_masked_strided2_unknown_tc( +; ENABLED_MASKED_STRIDED: vector.body: +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[INDEX]], i32 0 +; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = shl nuw nsw i32 [[INDEX]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[P:%.*]], i32 [[TMP0]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = icmp ule <8 x i32> {{.*}}, {{.*}} +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>* +; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> +; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> undef) +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = or i32 [[TMP0]], 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = select <8 
x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT: {{.*}} = add i32 [[INDEX]], 8
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = icmp eq i32 {{.*}}, {{.*}}
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP11]]
+; ENABLED_MASKED_STRIDED: for.end:
+; ENABLED_MASKED_STRIDED-NEXT: ret void
+
+define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %n) local_unnamed_addr optsize {
+entry:
+  %cmp20 = icmp sgt i32 %n, 0
+  br i1 %cmp20, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %ix.021 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %mul = shl nuw nsw i32 %ix.021, 1
+  %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = or i32 %mul, 1
+  %arrayidx2 = getelementptr inbounds i8, i8* %p, i32 %add
+  %1 = load i8, i8* %arrayidx2, align 1
+  %cmp.i = icmp slt i8 %0, %1
+  %spec.select.i = select i1 %cmp.i, i8 %1, i8 %0
+  %arrayidx4 = getelementptr inbounds i8, i8* %q, i32 %mul
+  store i8 %spec.select.i, i8* %arrayidx4, align 1
+  %sub = sub i8 0, %spec.select.i
+  %arrayidx8 = getelementptr inbounds i8, i8* %q, i32 %add
+  store i8 %sub, i8* %arrayidx8, align 1
+  %inc = add nuw nsw i32 %ix.021, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
new file mode 100644
index 00000000000..7d5526fe391
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-pr39099.ll
@@ -0,0 +1,60 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
+; This test checks the fix for PR39099.
+;
+; Check that the predicated load is not vectorized as an
+; interleaved-group (which requires proper masking, currently unsupported)
+; but rather as scalarized accesses.
+; (For SKX, Gather is not supported by the compiler for chars, therefore
+; the only remaining alternative is to scalarize).
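+;
+; Illustration only (not checked by FileCheck): scalarizing the predicated
+; load yields one branch-guarded scalar load per vector lane instead of a
+; wide load, roughly as below (names are illustrative):
+;
+;   void scalarized_model(const unsigned char *p, unsigned char t[8],
+;                         const _Bool mask[8], int ix) {
+;     for (int lane = 0; lane < 8; ++lane)
+;       if (mask[lane])                  /* pred.load.if / pred.load.continue */
+;         t[lane] = p[2 * (ix + lane)];
+;   }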
+; +; void masked_strided(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char t = p[2*ix]; +; q[ix] = t; +; } +; } +; } + +;CHECK-LABEL: @masked_strided( +;CHECK: vector.body: +;CHECK-NEXT: %index = phi i32 +;CHECK-NEXT: %[[VECIND:.+]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +;CHECK-NEXT: %[[VMASK:.+]] = icmp ugt <8 x i32> %[[VECIND]], %{{broadcast.splat*}} +;CHECK-NEXT: %{{.*}} = shl nuw nsw <8 x i32> %[[VECIND]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +;CHECK-NEXT: %[[M:.+]] = extractelement <8 x i1> %[[VMASK]], i32 0 +;CHECK-NEXT: br i1 %[[M]], label %pred.load.if, label %pred.load.continue +;CHECK-NOT: %[[WIDEVEC:.+]] = load <16 x i8>, <16 x i8>* %{{.*}}, align 1 +;CHECK-NOT: %{{.*}} = shufflevector <16 x i8> %[[WIDEVEC]], <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + +define dso_local void @masked_strided(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr { +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.09 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.09, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %mul = shl nuw nsw i32 %ix.09, 1 + %arrayidx = getelementptr inbounds i8, i8* %p, i32 %mul + %0 = load i8, i8* %arrayidx, align 1 + %arrayidx3 = getelementptr inbounds i8, i8* %q, i32 %ix.09 + store i8 %0, i8* %arrayidx3, align 1 + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.09, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll new file mode 100644 index 00000000000..45d843c9d58 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -0,0 +1,98 @@ +; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s +; RUN: opt -mcpu=skylake-avx512 -S -force-vector-width=8 -force-vector-interleave=1 -loop-vectorize < %s | FileCheck %s --check-prefix=SINK-GATHER + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK-LABEL: predicated_sdiv_masked_load +; +; This test ensures that we don't scalarize the predicated load. Since the load +; can be vectorized with predication, scalarizing it would cause its pointer +; operand to become non-uniform. 
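+;
+; Illustration only (not checked by FileCheck): because the b[i] accesses are
+; consecutive, the predicated load can remain a single masked vector load over
+; one uniform base pointer, roughly as below (names are illustrative):
+;
+;   void masked_load_model(const int *b, int lanes[2], _Bool c, long i) {
+;     const int *base = b + i;        /* uniform base for both lanes */
+;     for (int lane = 0; lane < 2; ++lane)
+;       if (c)                        /* same predicate for every lane */
+;         lanes[lane] = base[lane];
+;   }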
+;
+; CHECK: vector.body:
+; CHECK: %wide.masked.load = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32
+; CHECK: br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
+; CHECK: [[IF0]]:
+; CHECK: %[[T0:.+]] = extractelement <2 x i32> %wide.masked.load, i32 0
+; CHECK: %[[T1:.+]] = sdiv i32 %[[T0]], %x
+; CHECK: %[[T2:.+]] = insertelement <2 x i32> undef, i32 %[[T1]], i32 0
+; CHECK: br label %[[CONT0]]
+; CHECK: [[CONT0]]:
+; CHECK: %[[T3:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[T2]], %[[IF0]] ]
+; CHECK: br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
+; CHECK: [[IF1]]:
+; CHECK: %[[T4:.+]] = extractelement <2 x i32> %wide.masked.load, i32 1
+; CHECK: %[[T5:.+]] = sdiv i32 %[[T4]], %x
+; CHECK: %[[T6:.+]] = insertelement <2 x i32> %[[T3]], i32 %[[T5]], i32 1
+; CHECK: br label %[[CONT1]]
+; CHECK: [[CONT1]]:
+; CHECK: phi <2 x i32> [ %[[T3]], %[[CONT0]] ], [ %[[T6]], %[[IF1]] ]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define i32 @predicated_sdiv_masked_load(i32* %a, i32* %b, i32 %x, i1 %c) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp7, %for.inc ]
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+ %tmp1 = load i32, i32* %tmp0, align 4
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp2 = getelementptr inbounds i32, i32* %b, i64 %i
+ %tmp3 = load i32, i32* %tmp2, align 4
+ %tmp4 = sdiv i32 %tmp3, %x
+ %tmp5 = add nsw i32 %tmp4, %tmp1
+ br label %for.inc
+
+for.inc:
+ %tmp6 = phi i32 [ %tmp1, %for.body ], [ %tmp5, %if.then ]
+ %tmp7 = add i32 %r, %tmp6
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp eq i64 %i.next, 10000
+ br i1 %cond, label %for.end, label %for.body
+
+for.end:
+ %tmp8 = phi i32 [ %tmp7, %for.inc ]
+ ret i32 %tmp8
+}
+
+; This test ensures that a load that would otherwise have been widened is
+; instead scalarized when the cost model so decides, as part of its
+; sink-scalar-operands optimization for predicated instructions.
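+;
+; A plausible C source for the loop below, reconstructed from the IR (the
+; parameter types follow the IR signature; local names are illustrative,
+; not taken from the original source):
+;
+; unsigned scalarize_and_sink_gather(unsigned *a, _Bool c, unsigned x,
+;                                    long n) {
+;   unsigned r = 0;
+;   for (long i = 0; i < n; ++i)
+;     r += c ? a[i * 777] / x : x;  /* strided, gather-like access */
+;   return r;
+; }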
+;
+; SINK-GATHER: vector.body:
+; SINK-GATHER: pred.udiv.if:
+; SINK-GATHER: %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
+; SINK-GATHER: %{{.*}} = udiv i32 %[[T0]], %{{.*}}
+; SINK-GATHER: pred.udiv.continue:
+define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
+entry:
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+ %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+ %i7 = mul i64 %i, 777
+ br i1 %c, label %if.then, label %for.inc
+
+if.then:
+ %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i7
+ %tmp2 = load i32, i32* %tmp0, align 4
+ %tmp4 = udiv i32 %tmp2, %x
+ br label %for.inc
+
+for.inc:
+ %tmp5 = phi i32 [ %x, %for.body ], [ %tmp4, %if.then ]
+ %tmp6 = add i32 %r, %tmp5
+ %i.next = add nuw nsw i64 %i, 1
+ %cond = icmp slt i64 %i.next, %n
+ br i1 %cond, label %for.body, label %for.end
+
+for.end:
+ %tmp7 = phi i32 [ %tmp6, %for.inc ]
+ ret i32 %tmp7
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
new file mode 100644
index 00000000000..0debb338486
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
@@ -0,0 +1,29 @@
+; RUN: opt -O3 -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@x = common global [1024 x x86_fp80] zeroinitializer, align 16
+
+;CHECK-LABEL: @example(
+;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>*
+;CHECK: store
+;CHECK: ret void
+
+define void @example() nounwind ssp uwtable {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %conv = sitofp i32 1 to x86_fp80
+ %arrayidx = getelementptr inbounds [1024 x x86_fp80], [1024 x x86_fp80]* @x, i64 0, i64 %indvars.iv
+ store x86_fp80 %conv, x86_fp80* %arrayidx, align 16
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
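+
+; For reference, a plausible C source for this test, reconstructed from the
+; IR above (assuming x86_fp80 corresponds to long double on this target;
+; names are illustrative):
+;
+; long double x[1024];
+; void example(void) {
+;   for (int i = 0; i < 1024; ++i)
+;     x[i] = 1;
+; }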

