author    Eric Christopher <echristo@gmail.com>    2019-04-17 04:52:47 +0000
committer Eric Christopher <echristo@gmail.com>    2019-04-17 04:52:47 +0000
commit    cee313d288a4faf0355d76fb6e0e927e211d08a5 (patch)
tree      d386075318d761197779a96e5d8fc0dc7b06342b /llvm/test/Transforms/SLPVectorizer
parent    c3d6a929fdd92fd06d4304675ade8d7210ee711a (diff)
Revert "Temporarily Revert "Add basic loop fusion pass.""
The reversion apparently deleted the test/Transforms directory. Will be re-reverting again.

llvm-svn: 358552
Diffstat (limited to 'llvm/test/Transforms/SLPVectorizer')
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll  45
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll  124
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll  96
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll  111
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll  53
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll  543
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll  318
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll  242
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll  436
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg  2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll  46
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll  139
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll  80
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll  23
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll  87
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll  32
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll  59
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll  93
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll  310
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll  127
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll  149
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll  250
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg  3
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll  203
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll  722
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll  31
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/ARM/lit.local.cfg  2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/ARM/memory.ll  28
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/ARM/sroa.ll  87
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg  2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll  57
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll  70
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/PowerPC/aggregate.ll  24
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/PowerPC/lit.local.cfg  2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll  46
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll  39
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll  36
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/SystemZ/lit.local.cfg  3
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll  52
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll  94
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll  98
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll  74
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll  64
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll  45
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll  27
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll  25
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll  237
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll  93
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/addsub.ll  390
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/aggregate.ll  23
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/align.ll  76
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll  65
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll  489
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll  161
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll  571
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll  1254
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll  775
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll  1254
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll  729
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll  708
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll  1765
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll  1323
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll  1254
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll  1254
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll  775
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll  775
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll  1254
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll  729
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll  1254
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll  708
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/atomics.ll  35
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll  112
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll  45
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll  423
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll  179
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/bswap.ll  247
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/call.ll  177
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/cast.ll  115
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll  283
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/cmp_sel.ll  37
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll  116
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll  76
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll  556
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll  50
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll  65
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll  64
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll  213
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll  141
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll  141
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll  68
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll  77
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll  31
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll  150
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll  180
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll  66
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll  79
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll  191
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll  166
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll  95
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll  66
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/cse.ll  363
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll  905
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll  512
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/cttz.ll  905
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll  76
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll  101
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/diamond.ll  157
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/external_user.ll  132
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll  43
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll  22
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/extract.ll  79
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll  101
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll  46
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll  61
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll  44
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/fabs.ll  275
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll  343
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/flag.ll  87
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/fma.ll  563
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll  561
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll  673
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/fround.ll  2159
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/funclet.ll  70
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/gep.ll  65
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/gep_mismatch.ll  36
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/hadd.ll  527
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/hoist.ll  68
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll  1733
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll  1141
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll  1887
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/hsub.ll  527
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/implicitfloat.ll  38
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/in-tree-user.ll  81
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll  701
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll  754
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll  307
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll  515
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll  40
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll  124
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll  225
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll  112
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/limit.ll  70
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/lit.local.cfg  3
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll  50
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll  66
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/loopinvariant.ll  102
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/metadata.ll  71
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll  85
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll  64
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll  61
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll  62
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll  474
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/opt.ll  50
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/ordering.ll  103
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/partail.ll  80
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/phi.ll  341
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/phi3.ll  54
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll  48
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll  58
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll  147
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr16571.ll  22
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr16628.ll  43
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll  46
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr18060.ll  81
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr19657.ll  60
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr23510.ll  59
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll  66
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll  30
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll  105
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll  607
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll  118
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reduction.ll  70
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll  51
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll  81
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll  352
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll  137
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/remark_listcost.ll  61
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll  85
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll  53
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll  76
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll  130
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/resched.ll  172
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/return.ll  70
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll  134
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll  104
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll  90
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll  53
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll  138
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll  78
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/sext.ll  911
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll  914
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll  863
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll  815
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll  62
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll  152
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll  121
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll  1262
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll  37
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll  274
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll  61
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll  314
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll  268
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll  1164
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll  86
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll  69
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll  108
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll  984
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vector.ll  49
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll  30
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll  230
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/zext.ll  785
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/XCore/lit.local.cfg  2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll  38
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll  37
214 files changed, 59315 insertions(+), 0 deletions(-)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
new file mode 100644
index 00000000000..ad970b2bec1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; Currently disabled for a few subtargets (e.g. Kryo):
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s
+
+define void @f(float* %r, float* %w) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0
+; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[R0]] to <2 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], [[TMP2]]
+; CHECK-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
+; CHECK-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[W0]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+; NO_SLP-LABEL: @f(
+; NO_SLP-NEXT: [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0
+; NO_SLP-NEXT: [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1
+; NO_SLP-NEXT: [[F0:%.*]] = load float, float* [[R0]]
+; NO_SLP-NEXT: [[F1:%.*]] = load float, float* [[R1]]
+; NO_SLP-NEXT: [[ADD0:%.*]] = fadd float [[F0]], [[F0]]
+; NO_SLP-NEXT: [[ADD1:%.*]] = fadd float [[F1]], [[F1]]
+; NO_SLP-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
+; NO_SLP-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
+; NO_SLP-NEXT: store float [[ADD0]], float* [[W0]]
+; NO_SLP-NEXT: store float [[ADD1]], float* [[W1]]
+; NO_SLP-NEXT: ret void
+;
+ %r0 = getelementptr inbounds float, float* %r, i64 0
+ %r1 = getelementptr inbounds float, float* %r, i64 1
+ %f0 = load float, float* %r0
+ %f1 = load float, float* %r1
+ %add0 = fadd float %f0, %f0
+ %add1 = fadd float %f1, %f1
+ %w0 = getelementptr inbounds float, float* %w, i64 0
+ %w1 = getelementptr inbounds float, float* %w, i64 1
+ store float %add0, float* %w0
+ store float %add1, float* %w1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
new file mode 100644
index 00000000000..1a981a32804
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=aarch64-apple-ios -mcpu=cyclone -o - %s | FileCheck %s
+
+define void @f1(<2 x i16> %x, i16* %a) {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
+; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2
+; CHECK-NEXT: ret void
+;
+ %t2 = extractelement <2 x i16> %x, i32 0
+ %t3 = extractelement <2 x i16> %x, i32 1
+ %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+ %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+ %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+ %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+ store i16 %t2, i16* %a
+ store i16 %t2, i16* %ptr0
+ store i16 %t3, i16* %ptr1
+ store i16 %t3, i16* %ptr2
+ store i16 %t2, i16* %ptr3
+ ret void
+}
+
+define void @f2(<2 x i16> %x, i16* %a) {
+; CHECK-LABEL: @f2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[CONT:%.*]]
+; CHECK: cont:
+; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
+; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
+; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %cont
+
+cont: ; preds = %entry, %cont
+ %xx = phi <2 x i16> [ %x, %entry ], [ undef, %cont ]
+ %aa = phi i16* [ %a, %entry ], [ undef, %cont ]
+ %t2 = extractelement <2 x i16> %xx, i32 0
+ %t3 = extractelement <2 x i16> %xx, i32 1
+ %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+ %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+ %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+ %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+ store i16 %t2, i16* %a
+ store i16 %t2, i16* %ptr0
+ store i16 %t3, i16* %ptr1
+ store i16 %t3, i16* %ptr2
+ store i16 %t2, i16* %ptr3
+ %a_val = load i16, i16* %a, align 2
+ %cmp = icmp eq i16 %a_val, 0
+ br i1 %cmp, label %cont, label %exit
+
+exit: ; preds = %cont
+ ret void
+}
+
+define void @f3(<2 x i16> %x, i16* %a) {
+; CHECK-LABEL: @f3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[CONT:%.*]]
+; CHECK: cont:
+; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ]
+; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ]
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0
+; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>*
+; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2
+; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %cont
+
+cont: ; preds = %entry, %cont
+ %xx = phi <2 x i16> [ %x, %entry ], [ undef, %cont ]
+ %aa = phi i16* [ %a, %entry ], [ undef, %cont ]
+ %t2 = extractelement <2 x i16> %xx, i32 0
+ %t3 = extractelement <2 x i16> %xx, i32 1
+ %ptr0 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0
+ %ptr1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1
+ %ptr2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2
+ %ptr3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3
+ store i16 %t3, i16* %a
+ store i16 %t3, i16* %ptr0
+ store i16 %t2, i16* %ptr1
+ store i16 %t2, i16* %ptr2
+ store i16 %t3, i16* %ptr3
+ %a_val = load i16, i16* %a, align 2
+ %cmp = icmp eq i16 %a_val, 0
+ br i1 %cmp, label %cont, label %exit
+
+exit: ; preds = %cont
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
new file mode 100644
index 00000000000..3f831fd2cc7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer %s -slp-threshold=-10 | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%structA = type { [2 x float] }
+
+define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[XMIN:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
+; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]]
+; CHECK: for.body3.lr.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
+; CHECK: for.end27:
+; CHECK-NEXT: ret void
+;
+
+entry:
+ br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+ %conv5 = sitofp i32 %ymin to float
+ %conv = sitofp i32 %xmin to float
+ %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
+ %0 = load float, float* %arrayidx4, align 4
+ %sub = fsub fast float %conv, %0
+ %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
+ %1 = load float, float* %arrayidx9, align 4
+ %sub10 = fsub fast float %conv5, %1
+ %mul11 = fmul fast float %sub, %sub
+ %mul12 = fmul fast float %sub10, %sub10
+ %add = fadd fast float %mul11, %mul12
+ %cmp = fcmp oeq float %add, 0.000000e+00
+ br i1 %cmp, label %for.body3.lr.ph, label %for.end27
+
+for.end27:
+ ret void
+}
+
+define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[XMIN:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
+; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]]
+; CHECK: for.body3.lr.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
+; CHECK: for.end27:
+; CHECK-NEXT: ret void
+;
+
+entry:
+ br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+ %conv5 = sitofp i32 %ymin to float
+ %conv = sitofp i32 %xmin to float
+ %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0
+ %0 = load float, float* %arrayidx4, align 4
+ %sub = fsub fast float %conv, %0
+ %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1
+ %1 = load float, float* %arrayidx9, align 4
+ %sub10 = fsub fast float %conv5, %1
+ %mul11 = fmul fast float %sub, %sub
+ %mul12 = fmul fast float %sub10, %sub10
+ %add = fadd fast float %mul12, %mul11 ;;;<---- Operands commuted!!
+ %cmp = fcmp oeq float %add, 0.000000e+00
+ br i1 %cmp, label %for.body3.lr.ph, label %for.end27
+
+for.end27:
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
new file mode 100644
index 00000000000..8e36a921c75
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+
+target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
+
+declare void @foo(i64, i64, i64, i64)
+
+define void @test1(<4 x i16> %a, <4 x i16> %b, i64* %p) {
+; Make sure types of sub and its sources are not extended.
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
+; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
+; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64
+; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]
+; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
+; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
+; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]]
+; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]]
+; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
+; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]]
+; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]]
+; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
+; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]
+; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]]
+; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %z0 = zext <4 x i16> %a to <4 x i32>
+ %z1 = zext <4 x i16> %b to <4 x i32>
+ %sub0 = sub <4 x i32> %z0, %z1
+ %e0 = extractelement <4 x i32> %sub0, i32 0
+ %s0 = sext i32 %e0 to i64
+ %gep0 = getelementptr inbounds i64, i64* %p, i64 %s0
+ %load0 = load i64, i64* %gep0
+ %e1 = extractelement <4 x i32> %sub0, i32 1
+ %s1 = sext i32 %e1 to i64
+ %gep1 = getelementptr inbounds i64, i64* %p, i64 %s1
+ %load1 = load i64, i64* %gep1
+ %e2 = extractelement <4 x i32> %sub0, i32 2
+ %s2 = sext i32 %e2 to i64
+ %gep2 = getelementptr inbounds i64, i64* %p, i64 %s2
+ %load2 = load i64, i64* %gep2
+ %e3 = extractelement <4 x i32> %sub0, i32 3
+ %s3 = sext i32 %e3 to i64
+ %gep3 = getelementptr inbounds i64, i64* %p, i64 %s3
+ %load3 = load i64, i64* %gep3
+ call void @foo(i64 %load0, i64 %load1, i64 %load2, i64 %load3)
+ ret void
+}
+
+define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c3, i64* %p) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
+; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
+; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
+; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[C0:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]]
+; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP8]]
+; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP9]]
+; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]]
+; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %z0 = zext <4 x i16> %a to <4 x i32>
+ %z1 = zext <4 x i16> %b to <4 x i32>
+ %sub0 = sub <4 x i32> %z0, %z1
+ %e0 = extractelement <4 x i32> %sub0, i32 0
+ %s0 = sext i32 %e0 to i64
+ %a0 = add i64 %s0, %c0
+ %gep0 = getelementptr inbounds i64, i64* %p, i64 %a0
+ %load0 = load i64, i64* %gep0
+ %e1 = extractelement <4 x i32> %sub0, i32 1
+ %s1 = sext i32 %e1 to i64
+ %a1 = add i64 %s1, %c1
+ %gep1 = getelementptr inbounds i64, i64* %p, i64 %a1
+ %load1 = load i64, i64* %gep1
+ %e2 = extractelement <4 x i32> %sub0, i32 2
+ %s2 = sext i32 %e2 to i64
+ %a2 = add i64 %s2, %c2
+ %gep2 = getelementptr inbounds i64, i64* %p, i64 %a2
+ %load2 = load i64, i64* %gep2
+ %e3 = extractelement <4 x i32> %sub0, i32 3
+ %s3 = sext i32 %e3 to i64
+ %a3 = add i64 %s3, %c3
+ %gep3 = getelementptr inbounds i64, i64* %p, i64 %a3
+ %load3 = load i64, i64* %gep3
+ call void @foo(i64 %load0, i64 %load1, i64 %load2, i64 %load3)
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
new file mode 100644
index 00000000000..14a6d0eb72c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=REMARK %s
+; RUN: opt < %s -S -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=REMARK %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; REMARK-LABEL: Function: gather_multiple_use
+; REMARK: Args:
+; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; REMARK-NEXT: - Cost: '-7'
+;
+define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @gather_multiple_use(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]])
+; CHECK-NEXT: ret i32 [[TMP10]]
+;
+ %tmp00 = lshr i32 %a, 15
+ %tmp01 = and i32 %tmp00, 65537
+ %tmp02 = mul nuw i32 %tmp01, 65535
+ %tmp03 = add i32 %tmp02, %a
+ %tmp04 = xor i32 %tmp03, %tmp02
+ %tmp05 = lshr i32 %c, 15
+ %tmp06 = and i32 %tmp05, 65537
+ %tmp07 = mul nuw i32 %tmp06, 65535
+ %tmp08 = add i32 %tmp07, %c
+ %tmp09 = xor i32 %tmp08, %tmp07
+ %tmp10 = lshr i32 %b, 15
+ %tmp11 = and i32 %tmp10, 65537
+ %tmp12 = mul nuw i32 %tmp11, 65535
+ %tmp13 = add i32 %tmp12, %b
+ %tmp14 = xor i32 %tmp13, %tmp12
+ %tmp15 = lshr i32 %d, 15
+ %tmp16 = and i32 %tmp15, 65537
+ %tmp17 = mul nuw i32 %tmp16, 65535
+ %tmp18 = add i32 %tmp17, %d
+ %tmp19 = xor i32 %tmp18, %tmp17
+ %tmp20 = add i32 %tmp09, %tmp04
+ %tmp21 = add i32 %tmp20, %tmp14
+ %tmp22 = add i32 %tmp21, %tmp19
+ ret i32 %tmp22
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
new file mode 100644
index 00000000000..30acee4b25c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -0,0 +1,543 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=GENERIC
+; RUN: opt -S -mcpu=kryo -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=KRYO
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we vectorize the index calculations in the
+; gather-reduce pattern shown below. We check cases having i32 and i64
+; subtraction.
+;
+; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
+; int sum = 0;
+; for (int i = 0; i < n ; ++i) {
+; sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
+; sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
+; sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
+; sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
+; }
+; return sum;
+; }
+
+define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+; GENERIC-LABEL: @gather_reduce_8x16_i32(
+; GENERIC-NEXT: entry:
+; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; GENERIC: for.body.preheader:
+; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
+; GENERIC: for.cond.cleanup.loopexit:
+; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
+; GENERIC: for.cond.cleanup:
+; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
+; GENERIC: for.body:
+; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; GENERIC-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; GENERIC-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; GENERIC-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; GENERIC-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; GENERIC-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; GENERIC-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; GENERIC-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; GENERIC-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; GENERIC-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; GENERIC-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; GENERIC-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; GENERIC-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; GENERIC-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; GENERIC-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; GENERIC-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; GENERIC-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; GENERIC-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; GENERIC-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; GENERIC-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; KRYO-LABEL: @gather_reduce_8x16_i32(
+; KRYO-NEXT: entry:
+; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; KRYO: for.body.preheader:
+; KRYO-NEXT: br label [[FOR_BODY:%.*]]
+; KRYO: for.cond.cleanup.loopexit:
+; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
+; KRYO: for.cond.cleanup:
+; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
+; KRYO: for.body:
+; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; KRYO-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; KRYO-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; KRYO-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; KRYO-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; KRYO-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; KRYO-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; KRYO-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; KRYO-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; KRYO-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; KRYO-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; KRYO-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; KRYO-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; KRYO-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; KRYO-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; KRYO-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; KRYO-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; KRYO-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; KRYO-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; KRYO-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; KRYO-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp.99 = icmp sgt i32 %n, 0
+ br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+ ret i32 %sum.0.lcssa
+
+for.body:
+ %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+ %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+ %0 = load i16, i16* %a.addr.0101, align 2
+ %conv = zext i16 %0 to i32
+ %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+ %1 = load i16, i16* %b, align 2
+ %conv2 = zext i16 %1 to i32
+ %sub = sub nsw i32 %conv, %conv2
+ %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
+ %2 = load i16, i16* %arrayidx, align 2
+ %conv3 = zext i16 %2 to i32
+ %add = add nsw i32 %conv3, %sum.0102
+ %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+ %3 = load i16, i16* %incdec.ptr, align 2
+ %conv5 = zext i16 %3 to i32
+ %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+ %4 = load i16, i16* %incdec.ptr1, align 2
+ %conv7 = zext i16 %4 to i32
+ %sub8 = sub nsw i32 %conv5, %conv7
+ %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
+ %5 = load i16, i16* %arrayidx10, align 2
+ %conv11 = zext i16 %5 to i32
+ %add12 = add nsw i32 %add, %conv11
+ %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+ %6 = load i16, i16* %incdec.ptr4, align 2
+ %conv14 = zext i16 %6 to i32
+ %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+ %7 = load i16, i16* %incdec.ptr6, align 2
+ %conv16 = zext i16 %7 to i32
+ %sub17 = sub nsw i32 %conv14, %conv16
+ %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
+ %8 = load i16, i16* %arrayidx19, align 2
+ %conv20 = zext i16 %8 to i32
+ %add21 = add nsw i32 %add12, %conv20
+ %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+ %9 = load i16, i16* %incdec.ptr13, align 2
+ %conv23 = zext i16 %9 to i32
+ %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+ %10 = load i16, i16* %incdec.ptr15, align 2
+ %conv25 = zext i16 %10 to i32
+ %sub26 = sub nsw i32 %conv23, %conv25
+ %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
+ %11 = load i16, i16* %arrayidx28, align 2
+ %conv29 = zext i16 %11 to i32
+ %add30 = add nsw i32 %add21, %conv29
+ %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+ %12 = load i16, i16* %incdec.ptr22, align 2
+ %conv32 = zext i16 %12 to i32
+ %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+ %13 = load i16, i16* %incdec.ptr24, align 2
+ %conv34 = zext i16 %13 to i32
+ %sub35 = sub nsw i32 %conv32, %conv34
+ %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
+ %14 = load i16, i16* %arrayidx37, align 2
+ %conv38 = zext i16 %14 to i32
+ %add39 = add nsw i32 %add30, %conv38
+ %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+ %15 = load i16, i16* %incdec.ptr31, align 2
+ %conv41 = zext i16 %15 to i32
+ %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+ %16 = load i16, i16* %incdec.ptr33, align 2
+ %conv43 = zext i16 %16 to i32
+ %sub44 = sub nsw i32 %conv41, %conv43
+ %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
+ %17 = load i16, i16* %arrayidx46, align 2
+ %conv47 = zext i16 %17 to i32
+ %add48 = add nsw i32 %add39, %conv47
+ %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+ %18 = load i16, i16* %incdec.ptr40, align 2
+ %conv50 = zext i16 %18 to i32
+ %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+ %19 = load i16, i16* %incdec.ptr42, align 2
+ %conv52 = zext i16 %19 to i32
+ %sub53 = sub nsw i32 %conv50, %conv52
+ %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
+ %20 = load i16, i16* %arrayidx55, align 2
+ %conv56 = zext i16 %20 to i32
+ %add57 = add nsw i32 %add48, %conv56
+ %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+ %21 = load i16, i16* %incdec.ptr49, align 2
+ %conv59 = zext i16 %21 to i32
+ %22 = load i16, i16* %incdec.ptr51, align 2
+ %conv61 = zext i16 %22 to i32
+ %sub62 = sub nsw i32 %conv59, %conv61
+ %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
+ %23 = load i16, i16* %arrayidx64, align 2
+ %conv65 = zext i16 %23 to i32
+ %add66 = add nsw i32 %add57, %conv65
+ %inc = add nuw nsw i32 %i.0103, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+; GENERIC-LABEL: @gather_reduce_8x16_i64(
+; GENERIC-NEXT: entry:
+; GENERIC-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; GENERIC-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; GENERIC: for.body.preheader:
+; GENERIC-NEXT: br label [[FOR_BODY:%.*]]
+; GENERIC: for.cond.cleanup.loopexit:
+; GENERIC-NEXT: br label [[FOR_COND_CLEANUP]]
+; GENERIC: for.cond.cleanup:
+; GENERIC-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; GENERIC-NEXT: ret i32 [[SUM_0_LCSSA]]
+; GENERIC: for.body:
+; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; GENERIC-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; GENERIC-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; GENERIC-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; GENERIC-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; GENERIC-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; GENERIC-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; GENERIC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; GENERIC-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; GENERIC-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; GENERIC-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; GENERIC-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; GENERIC-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; GENERIC-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; GENERIC-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; GENERIC-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; GENERIC-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; GENERIC-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; GENERIC-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; GENERIC-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; GENERIC-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; GENERIC-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; GENERIC-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; GENERIC-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; GENERIC-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; GENERIC-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; GENERIC-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; GENERIC-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; GENERIC-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; GENERIC-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; GENERIC-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; GENERIC-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; GENERIC-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; GENERIC-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; GENERIC-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; GENERIC-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; GENERIC-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; GENERIC-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; GENERIC-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; GENERIC-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; GENERIC-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; GENERIC-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; GENERIC-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; GENERIC-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; GENERIC-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; GENERIC-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; GENERIC-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; GENERIC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; GENERIC-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+; KRYO-LABEL: @gather_reduce_8x16_i64(
+; KRYO-NEXT: entry:
+; KRYO-NEXT: [[CMP_99:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; KRYO-NEXT: br i1 [[CMP_99]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; KRYO: for.body.preheader:
+; KRYO-NEXT: br label [[FOR_BODY:%.*]]
+; KRYO: for.cond.cleanup.loopexit:
+; KRYO-NEXT: br label [[FOR_COND_CLEANUP]]
+; KRYO: for.cond.cleanup:
+; KRYO-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD66:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; KRYO-NEXT: ret i32 [[SUM_0_LCSSA]]
+; KRYO: for.body:
+; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
+; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
+; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
+; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
+; KRYO-NEXT: [[TMP3:%.*]] = bitcast i16* [[B:%.*]] to <8 x i16>*
+; KRYO-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 2
+; KRYO-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i32>
+; KRYO-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; KRYO-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP6]], i32 0
+; KRYO-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; KRYO-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[G:%.*]], i64 [[TMP8]]
+; KRYO-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; KRYO-NEXT: [[CONV3:%.*]] = zext i16 [[TMP9]] to i32
+; KRYO-NEXT: [[ADD:%.*]] = add nsw i32 [[SUM_0102]], [[CONV3]]
+; KRYO-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP6]], i32 1
+; KRYO-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; KRYO-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP11]]
+; KRYO-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; KRYO-NEXT: [[CONV11:%.*]] = zext i16 [[TMP12]] to i32
+; KRYO-NEXT: [[ADD12:%.*]] = add nsw i32 [[ADD]], [[CONV11]]
+; KRYO-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP6]], i32 2
+; KRYO-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; KRYO-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP14]]
+; KRYO-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX19]], align 2
+; KRYO-NEXT: [[CONV20:%.*]] = zext i16 [[TMP15]] to i32
+; KRYO-NEXT: [[ADD21:%.*]] = add nsw i32 [[ADD12]], [[CONV20]]
+; KRYO-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP6]], i32 3
+; KRYO-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; KRYO-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP17]]
+; KRYO-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX28]], align 2
+; KRYO-NEXT: [[CONV29:%.*]] = zext i16 [[TMP18]] to i32
+; KRYO-NEXT: [[ADD30:%.*]] = add nsw i32 [[ADD21]], [[CONV29]]
+; KRYO-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP6]], i32 4
+; KRYO-NEXT: [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
+; KRYO-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP20]]
+; KRYO-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX37]], align 2
+; KRYO-NEXT: [[CONV38:%.*]] = zext i16 [[TMP21]] to i32
+; KRYO-NEXT: [[ADD39:%.*]] = add nsw i32 [[ADD30]], [[CONV38]]
+; KRYO-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP6]], i32 5
+; KRYO-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; KRYO-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP23]]
+; KRYO-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX46]], align 2
+; KRYO-NEXT: [[CONV47:%.*]] = zext i16 [[TMP24]] to i32
+; KRYO-NEXT: [[ADD48:%.*]] = add nsw i32 [[ADD39]], [[CONV47]]
+; KRYO-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP6]], i32 6
+; KRYO-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
+; KRYO-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP26]]
+; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
+; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
+; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
+; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
+; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i32 7
+; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
+; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
+; KRYO-NEXT: [[TMP30:%.*]] = load i16, i16* [[ARRAYIDX64]], align 2
+; KRYO-NEXT: [[CONV65:%.*]] = zext i16 [[TMP30]] to i32
+; KRYO-NEXT: [[ADD66]] = add nsw i32 [[ADD57]], [[CONV65]]
+; KRYO-NEXT: [[INC]] = add nuw nsw i32 [[I_0103]], 1
+; KRYO-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; KRYO-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp.99 = icmp sgt i32 %n, 0
+ br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+ ret i32 %sum.0.lcssa
+
+for.body:
+ %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+ %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+ %0 = load i16, i16* %a.addr.0101, align 2
+ %conv = zext i16 %0 to i64
+ %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+ %1 = load i16, i16* %b, align 2
+ %conv2 = zext i16 %1 to i64
+ %sub = sub nsw i64 %conv, %conv2
+ %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
+ %2 = load i16, i16* %arrayidx, align 2
+ %conv3 = zext i16 %2 to i32
+ %add = add nsw i32 %conv3, %sum.0102
+ %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+ %3 = load i16, i16* %incdec.ptr, align 2
+ %conv5 = zext i16 %3 to i64
+ %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+ %4 = load i16, i16* %incdec.ptr1, align 2
+ %conv7 = zext i16 %4 to i64
+ %sub8 = sub nsw i64 %conv5, %conv7
+ %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
+ %5 = load i16, i16* %arrayidx10, align 2
+ %conv11 = zext i16 %5 to i32
+ %add12 = add nsw i32 %add, %conv11
+ %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+ %6 = load i16, i16* %incdec.ptr4, align 2
+ %conv14 = zext i16 %6 to i64
+ %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+ %7 = load i16, i16* %incdec.ptr6, align 2
+ %conv16 = zext i16 %7 to i64
+ %sub17 = sub nsw i64 %conv14, %conv16
+ %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
+ %8 = load i16, i16* %arrayidx19, align 2
+ %conv20 = zext i16 %8 to i32
+ %add21 = add nsw i32 %add12, %conv20
+ %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+ %9 = load i16, i16* %incdec.ptr13, align 2
+ %conv23 = zext i16 %9 to i64
+ %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+ %10 = load i16, i16* %incdec.ptr15, align 2
+ %conv25 = zext i16 %10 to i64
+ %sub26 = sub nsw i64 %conv23, %conv25
+ %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
+ %11 = load i16, i16* %arrayidx28, align 2
+ %conv29 = zext i16 %11 to i32
+ %add30 = add nsw i32 %add21, %conv29
+ %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+ %12 = load i16, i16* %incdec.ptr22, align 2
+ %conv32 = zext i16 %12 to i64
+ %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+ %13 = load i16, i16* %incdec.ptr24, align 2
+ %conv34 = zext i16 %13 to i64
+ %sub35 = sub nsw i64 %conv32, %conv34
+ %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
+ %14 = load i16, i16* %arrayidx37, align 2
+ %conv38 = zext i16 %14 to i32
+ %add39 = add nsw i32 %add30, %conv38
+ %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+ %15 = load i16, i16* %incdec.ptr31, align 2
+ %conv41 = zext i16 %15 to i64
+ %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+ %16 = load i16, i16* %incdec.ptr33, align 2
+ %conv43 = zext i16 %16 to i64
+ %sub44 = sub nsw i64 %conv41, %conv43
+ %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
+ %17 = load i16, i16* %arrayidx46, align 2
+ %conv47 = zext i16 %17 to i32
+ %add48 = add nsw i32 %add39, %conv47
+ %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+ %18 = load i16, i16* %incdec.ptr40, align 2
+ %conv50 = zext i16 %18 to i64
+ %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+ %19 = load i16, i16* %incdec.ptr42, align 2
+ %conv52 = zext i16 %19 to i64
+ %sub53 = sub nsw i64 %conv50, %conv52
+ %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
+ %20 = load i16, i16* %arrayidx55, align 2
+ %conv56 = zext i16 %20 to i32
+ %add57 = add nsw i32 %add48, %conv56
+ %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+ %21 = load i16, i16* %incdec.ptr49, align 2
+ %conv59 = zext i16 %21 to i64
+ %22 = load i16, i16* %incdec.ptr51, align 2
+ %conv61 = zext i16 %22 to i64
+ %sub62 = sub nsw i64 %conv59, %conv61
+ %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
+ %23 = load i16, i16* %arrayidx64, align 2
+ %conv65 = zext i16 %23 to i32
+ %add66 = add nsw i32 %add57, %conv65
+ %inc = add nuw nsw i32 %i.0103, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
new file mode 100644
index 00000000000..a142d4e0355
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -0,0 +1,318 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
+; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
+; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+@a = common global [80 x i8] zeroinitializer, align 16
+
+define void @PR28330(i32 %n) {
+; DEFAULT-LABEL: @PR28330(
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
+; DEFAULT: for.body:
+; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; DEFAULT-NEXT: [[P20:%.*]] = add i32 [[P17]], undef
+; DEFAULT-NEXT: [[P22:%.*]] = add i32 [[P20]], undef
+; DEFAULT-NEXT: [[P24:%.*]] = add i32 [[P22]], undef
+; DEFAULT-NEXT: [[P26:%.*]] = add i32 [[P24]], undef
+; DEFAULT-NEXT: [[P28:%.*]] = add i32 [[P26]], undef
+; DEFAULT-NEXT: [[P30:%.*]] = add i32 [[P28]], undef
+; DEFAULT-NEXT: [[P32:%.*]] = add i32 [[P30]], undef
+; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]])
+; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]]
+; DEFAULT-NEXT: [[P34:%.*]] = add i32 [[P32]], undef
+; DEFAULT-NEXT: br label [[FOR_BODY]]
+;
+; GATHER-LABEL: @PR28330(
+; GATHER-NEXT: entry:
+; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; GATHER-NEXT: br label [[FOR_BODY:%.*]]
+; GATHER: for.body:
+; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0
+; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
+; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1
+; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
+; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2
+; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
+; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3
+; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
+; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4
+; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
+; GATHER-NEXT: [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5
+; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
+; GATHER-NEXT: [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6
+; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
+; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7
+; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
+; GATHER-NEXT: [[P20:%.*]] = add i32 [[P17]], [[TMP19]]
+; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
+; GATHER-NEXT: [[P22:%.*]] = add i32 [[P20]], [[TMP20]]
+; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
+; GATHER-NEXT: [[P24:%.*]] = add i32 [[P22]], [[TMP21]]
+; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
+; GATHER-NEXT: [[P26:%.*]] = add i32 [[P24]], [[TMP22]]
+; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
+; GATHER-NEXT: [[P28:%.*]] = add i32 [[P26]], [[TMP23]]
+; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
+; GATHER-NEXT: [[P30:%.*]] = add i32 [[P28]], [[TMP24]]
+; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
+; GATHER-NEXT: [[P32:%.*]] = add i32 [[P30]], [[TMP25]]
+; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
+; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
+; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
+; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
+; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
+; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
+; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
+; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
+; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
+; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]])
+; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], [[P17]]
+; GATHER-NEXT: [[P34:%.*]] = add i32 [[P32]], [[TMP33]]
+; GATHER-NEXT: br label [[FOR_BODY]]
+;
+; MAX-COST-LABEL: @PR28330(
+; MAX-COST-NEXT: entry:
+; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0
+; MAX-COST-NEXT: [[P2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
+; MAX-COST-NEXT: [[P3:%.*]] = icmp eq i8 [[P2]], 0
+; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0
+; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0
+; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0
+; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0
+; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0
+; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
+; MAX-COST: for.body:
+; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT: [[P19:%.*]] = select i1 [[P1]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P20:%.*]] = add i32 [[P17]], [[P19]]
+; MAX-COST-NEXT: [[P21:%.*]] = select i1 [[P3]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P22:%.*]] = add i32 [[P20]], [[P21]]
+; MAX-COST-NEXT: [[P23:%.*]] = select i1 [[P5]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P24:%.*]] = add i32 [[P22]], [[P23]]
+; MAX-COST-NEXT: [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P26:%.*]] = add i32 [[P24]], [[P25]]
+; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P28:%.*]] = add i32 [[P26]], [[P27]]
+; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P30:%.*]] = add i32 [[P28]], [[P29]]
+; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[P30]], [[P31]]
+; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT: br label [[FOR_BODY]]
+;
+entry:
+ %p0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+ %p1 = icmp eq i8 %p0, 0
+ %p2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
+ %p3 = icmp eq i8 %p2, 0
+ %p4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+ %p5 = icmp eq i8 %p4, 0
+ %p6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+ %p7 = icmp eq i8 %p6, 0
+ %p8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+ %p9 = icmp eq i8 %p8, 0
+ %p10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+ %p11 = icmp eq i8 %p10, 0
+ %p12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+ %p13 = icmp eq i8 %p12, 0
+ %p14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+ %p15 = icmp eq i8 %p14, 0
+ br label %for.body
+
+for.body:
+ %p17 = phi i32 [ %p34, %for.body ], [ 0, %entry ]
+ %p19 = select i1 %p1, i32 -720, i32 -80
+ %p20 = add i32 %p17, %p19
+ %p21 = select i1 %p3, i32 -720, i32 -80
+ %p22 = add i32 %p20, %p21
+ %p23 = select i1 %p5, i32 -720, i32 -80
+ %p24 = add i32 %p22, %p23
+ %p25 = select i1 %p7, i32 -720, i32 -80
+ %p26 = add i32 %p24, %p25
+ %p27 = select i1 %p9, i32 -720, i32 -80
+ %p28 = add i32 %p26, %p27
+ %p29 = select i1 %p11, i32 -720, i32 -80
+ %p30 = add i32 %p28, %p29
+ %p31 = select i1 %p13, i32 -720, i32 -80
+ %p32 = add i32 %p30, %p31
+ %p33 = select i1 %p15, i32 -720, i32 -80
+ %p34 = add i32 %p32, %p33
+ br label %for.body
+}
+
+define void @PR32038(i32 %n) {
+; DEFAULT-LABEL: @PR32038(
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; DEFAULT-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
+; DEFAULT: for.body:
+; DEFAULT-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; DEFAULT-NEXT: [[P20:%.*]] = add i32 -5, undef
+; DEFAULT-NEXT: [[P22:%.*]] = add i32 [[P20]], undef
+; DEFAULT-NEXT: [[P24:%.*]] = add i32 [[P22]], undef
+; DEFAULT-NEXT: [[P26:%.*]] = add i32 [[P24]], undef
+; DEFAULT-NEXT: [[P28:%.*]] = add i32 [[P26]], undef
+; DEFAULT-NEXT: [[P30:%.*]] = add i32 [[P28]], undef
+; DEFAULT-NEXT: [[P32:%.*]] = add i32 [[P30]], undef
+; DEFAULT-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]])
+; DEFAULT-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5
+; DEFAULT-NEXT: [[P34:%.*]] = add i32 [[P32]], undef
+; DEFAULT-NEXT: br label [[FOR_BODY]]
+;
+; GATHER-LABEL: @PR32038(
+; GATHER-NEXT: entry:
+; GATHER-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; GATHER-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; GATHER-NEXT: br label [[FOR_BODY:%.*]]
+; GATHER: for.body:
+; GATHER-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
+; GATHER-NEXT: [[TMP3:%.*]] = insertelement <8 x i1> undef, i1 [[TMP2]], i32 0
+; GATHER-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
+; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i1> [[TMP3]], i1 [[TMP4]], i32 1
+; GATHER-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
+; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i1> [[TMP5]], i1 [[TMP6]], i32 2
+; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
+; GATHER-NEXT: [[TMP9:%.*]] = insertelement <8 x i1> [[TMP7]], i1 [[TMP8]], i32 3
+; GATHER-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
+; GATHER-NEXT: [[TMP11:%.*]] = insertelement <8 x i1> [[TMP9]], i1 [[TMP10]], i32 4
+; GATHER-NEXT: [[TMP12:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
+; GATHER-NEXT: [[TMP13:%.*]] = insertelement <8 x i1> [[TMP11]], i1 [[TMP12]], i32 5
+; GATHER-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
+; GATHER-NEXT: [[TMP15:%.*]] = insertelement <8 x i1> [[TMP13]], i1 [[TMP14]], i32 6
+; GATHER-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
+; GATHER-NEXT: [[TMP17:%.*]] = insertelement <8 x i1> [[TMP15]], i1 [[TMP16]], i32 7
+; GATHER-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; GATHER-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP18]], i32 0
+; GATHER-NEXT: [[P20:%.*]] = add i32 -5, [[TMP19]]
+; GATHER-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP18]], i32 1
+; GATHER-NEXT: [[P22:%.*]] = add i32 [[P20]], [[TMP20]]
+; GATHER-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP18]], i32 2
+; GATHER-NEXT: [[P24:%.*]] = add i32 [[P22]], [[TMP21]]
+; GATHER-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP18]], i32 3
+; GATHER-NEXT: [[P26:%.*]] = add i32 [[P24]], [[TMP22]]
+; GATHER-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP18]], i32 4
+; GATHER-NEXT: [[P28:%.*]] = add i32 [[P26]], [[TMP23]]
+; GATHER-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP18]], i32 5
+; GATHER-NEXT: [[P30:%.*]] = add i32 [[P28]], [[TMP24]]
+; GATHER-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP18]], i32 6
+; GATHER-NEXT: [[P32:%.*]] = add i32 [[P30]], [[TMP25]]
+; GATHER-NEXT: [[TMP26:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
+; GATHER-NEXT: [[TMP27:%.*]] = insertelement <8 x i32> [[TMP26]], i32 [[TMP20]], i32 1
+; GATHER-NEXT: [[TMP28:%.*]] = insertelement <8 x i32> [[TMP27]], i32 [[TMP21]], i32 2
+; GATHER-NEXT: [[TMP29:%.*]] = insertelement <8 x i32> [[TMP28]], i32 [[TMP22]], i32 3
+; GATHER-NEXT: [[TMP30:%.*]] = insertelement <8 x i32> [[TMP29]], i32 [[TMP23]], i32 4
+; GATHER-NEXT: [[TMP31:%.*]] = insertelement <8 x i32> [[TMP30]], i32 [[TMP24]], i32 5
+; GATHER-NEXT: [[TMP32:%.*]] = insertelement <8 x i32> [[TMP31]], i32 [[TMP25]], i32 6
+; GATHER-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP18]], i32 7
+; GATHER-NEXT: [[TMP34:%.*]] = insertelement <8 x i32> [[TMP32]], i32 [[TMP33]], i32 7
+; GATHER-NEXT: [[TMP35:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP34]])
+; GATHER-NEXT: [[OP_EXTRA]] = add i32 [[TMP35]], -5
+; GATHER-NEXT: [[P34:%.*]] = add i32 [[P32]], [[TMP33]]
+; GATHER-NEXT: br label [[FOR_BODY]]
+;
+; MAX-COST-LABEL: @PR32038(
+; MAX-COST-NEXT: entry:
+; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
+; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
+; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0
+; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0
+; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0
+; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0
+; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0
+; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0
+; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
+; MAX-COST: for.body:
+; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0
+; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
+; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2
+; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3
+; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT: [[P20:%.*]] = add i32 -5, undef
+; MAX-COST-NEXT: [[P22:%.*]] = add i32 [[P20]], undef
+; MAX-COST-NEXT: [[P24:%.*]] = add i32 [[P22]], undef
+; MAX-COST-NEXT: [[P26:%.*]] = add i32 [[P24]], undef
+; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P28:%.*]] = add i32 [[P26]], [[P27]]
+; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]])
+; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]]
+; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]]
+; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5
+; MAX-COST-NEXT: [[P30:%.*]] = add i32 [[P28]], [[P29]]
+; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
+; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
+; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]]
+; MAX-COST-NEXT: br label [[FOR_BODY]]
+;
+entry:
+ %p0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+ %p1 = icmp eq i8 %p0, 0
+ %p2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
+ %p3 = icmp eq i8 %p2, 0
+ %p4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+ %p5 = icmp eq i8 %p4, 0
+ %p6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+ %p7 = icmp eq i8 %p6, 0
+ %p8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+ %p9 = icmp eq i8 %p8, 0
+ %p10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+ %p11 = icmp eq i8 %p10, 0
+ %p12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+ %p13 = icmp eq i8 %p12, 0
+ %p14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+ %p15 = icmp eq i8 %p14, 0
+ br label %for.body
+
+for.body:
+ %p17 = phi i32 [ %p34, %for.body ], [ 0, %entry ]
+ %p19 = select i1 %p1, i32 -720, i32 -80
+ %p20 = add i32 -5, %p19
+ %p21 = select i1 %p3, i32 -720, i32 -80
+ %p22 = add i32 %p20, %p21
+ %p23 = select i1 %p5, i32 -720, i32 -80
+ %p24 = add i32 %p22, %p23
+ %p25 = select i1 %p7, i32 -720, i32 -80
+ %p26 = add i32 %p24, %p25
+ %p27 = select i1 %p9, i32 -720, i32 -80
+ %p28 = add i32 %p26, %p27
+ %p29 = select i1 %p11, i32 -720, i32 -80
+ %p30 = add i32 %p28, %p29
+ %p31 = select i1 %p13, i32 -720, i32 -80
+ %p32 = add i32 %p30, %p31
+ %p33 = select i1 %p15, i32 -720, i32 -80
+ %p34 = add i32 %p32, %p33
+ br label %for.body
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
new file mode 100644
index 00000000000..d3bbf3df858
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
+; RUN: opt -S -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-18 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
+
+
+target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we remove from consideration pairs of seed
+; getelementptrs when they are known to have a constant difference. Such pairs
+; are likely not good candidates for vectorization since one can be computed
+; from the other. We use an unprofitable threshold to force vectorization.
+;
+; int getelementptr(int *g, int n, int w, int x, int y, int z) {
+; int sum = 0;
+;   for (int i = 0; i < n; ++i) {
+; sum += g[2*i + w]; sum += g[2*i + x];
+; sum += g[2*i + y]; sum += g[2*i + z];
+; }
+; return sum;
+; }
+;
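+; As a hedged illustration only (not FileCheck'd): a seed pair with a known
+; constant difference, where the second address is computable from the first
+; by a fixed offset. This mirrors getelementptr_2x32 below, whose indices
+; %t4 + 0 and %t4 + 1 differ by exactly one element:
+;
+;   %t0 = add nsw i32 %t4, 0                            ; 2*i
+;   %t1 = add nsw i32 %t4, 1                            ; 2*i + 1
+;   %p0 = getelementptr inbounds i32, i32* %g, i32 %t0
+;   %p1 = getelementptr inbounds i32, i32* %g, i32 %t1  ; %p0 plus one i32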
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedList
+; YAML-NEXT: Function: getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'SLP vectorized with cost '
+; YAML-NEXT: - Cost: '11'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '5'
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedList
+; YAML-NEXT: Function: getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'SLP vectorized with cost '
+; YAML-NEXT: - Cost: '16'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '3'
+
+define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @getelementptr_4x32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, i32 [[X:%.*]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[Y:%.*]], i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i32 1
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP10]]
+; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP13]]
+; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP15]]
+; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]]
+; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[ADD11]], i32 1
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT: [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp31 = icmp sgt i32 %n, 0
+ br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+ ret i32 %sum.0.lcssa
+
+for.body:
+ %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+ %t4 = shl nsw i32 %indvars.iv, 1
+ %t5 = add nsw i32 %t4, 0
+ %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+ %t6 = load i32, i32* %arrayidx, align 4
+ %add1 = add nsw i32 %t6, %sum.032
+ %t7 = add nsw i32 %t4, %x
+ %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+ %t8 = load i32, i32* %arrayidx5, align 4
+ %add6 = add nsw i32 %add1, %t8
+ %t9 = add nsw i32 %t4, %y
+ %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+ %t10 = load i32, i32* %arrayidx10, align 4
+ %add11 = add nsw i32 %add6, %t10
+ %t11 = add nsw i32 %t4, %z
+ %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+ %t12 = load i32, i32* %arrayidx15, align 4
+ %add16 = add nsw i32 %add11, %t12
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedList
+; YAML-NEXT: Function: getelementptr_2x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'SLP vectorized with cost '
+; YAML-NEXT: - Cost: '11'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '5'
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedList
+; YAML-NEXT: Function: getelementptr_2x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'SLP vectorized with cost '
+; YAML-NEXT: - Cost: '6'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '3'
+
+define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @getelementptr_2x32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i32 1
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP18:%.*]], i32 1
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP4]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[T4]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP6]]
+; CHECK-NEXT: [[T7:%.*]] = or i32 [[T4]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[T7]] to i64
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP7]]
+; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[T4]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP9]], [[TMP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP12]]
+; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]]
+; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[ADD11]], i32 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT: [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP19]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+ %cmp31 = icmp sgt i32 %n, 0
+ br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+ ret i32 %sum.0.lcssa
+
+for.body:
+ %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+ %t4 = shl nsw i32 %indvars.iv, 1
+ %t5 = add nsw i32 %t4, 0
+ %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+ %t6 = load i32, i32* %arrayidx, align 4
+ %add1 = add nsw i32 %t6, %sum.032
+ %t7 = add nsw i32 %t4, 1
+ %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+ %t8 = load i32, i32* %arrayidx5, align 4
+ %add6 = add nsw i32 %add1, %t8
+ %t9 = add nsw i32 %t4, %y
+ %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+ %t10 = load i32, i32* %arrayidx10, align 4
+ %add11 = add nsw i32 %add6, %t10
+ %t11 = add nsw i32 %t4, %z
+ %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+ %t12 = load i32, i32* %arrayidx15, align 4
+ %add16 = add nsw i32 %add11, %t12
+ %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
new file mode 100644
index 00000000000..c4a584fcbf3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
@@ -0,0 +1,436 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=YAML %s
+
+
+; FIXME: The threshold is changed to keep this test case a bit smaller.
+; The AArch64 cost model should not give such high costs to select statements.
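+;
+; The costed pattern (a hedged illustration, mirroring test_select below) is
+; the scalar absolute-difference idiom that SLP turns into a vector select:
+;
+;   %sub = sub nsw i32 %a, %b
+;   %cmp = icmp slt i32 %sub, 0
+;   %neg = sub nsw i32 0, %sub
+;   %abs = select i1 %cmp, i32 %neg, i32 %sub   ; abs(%a - %b)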
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux"
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedHorizontalReduction
+; YAML-NEXT: Function: test_select
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; YAML-NEXT: - Cost: '-8'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '8'
+
+define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) {
+; CHECK-LABEL: @test_select(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_22:%.*]] = icmp sgt i32 [[H:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP_22]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[S_026:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[J_025:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[P2_024:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR29:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[P1_023:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 1
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 1
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[S_026]]
+; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD]], undef
+; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[ADD11]], undef
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]]
+; CHECK-NEXT: [[ADD27:%.*]] = add nsw i32 [[ADD19]], undef
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_025]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[H]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
+;
+entry:
+ %cmp.22 = icmp sgt i32 %h, 0
+ br i1 %cmp.22, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %idx.ext = sext i32 %lx to i64
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %s.026 = phi i32 [ 0, %for.body.lr.ph ], [ %add27, %for.body ]
+ %j.025 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %p2.024 = phi i32* [ %blk2, %for.body.lr.ph ], [ %add.ptr29, %for.body ]
+ %p1.023 = phi i32* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %for.body ]
+ %0 = load i32, i32* %p1.023, align 4
+ %1 = load i32, i32* %p2.024, align 4
+ %sub = sub nsw i32 %0, %1
+ %cmp2 = icmp slt i32 %sub, 0
+ %sub3 = sub nsw i32 0, %sub
+ %sub3.sub = select i1 %cmp2, i32 %sub3, i32 %sub
+ %add = add nsw i32 %sub3.sub, %s.026
+ %arrayidx4 = getelementptr inbounds i32, i32* %p1.023, i64 1
+ %2 = load i32, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32, i32* %p2.024, i64 1
+ %3 = load i32, i32* %arrayidx5, align 4
+ %sub6 = sub nsw i32 %2, %3
+ %cmp7 = icmp slt i32 %sub6, 0
+ %sub9 = sub nsw i32 0, %sub6
+ %v.1 = select i1 %cmp7, i32 %sub9, i32 %sub6
+ %add11 = add nsw i32 %add, %v.1
+ %arrayidx12 = getelementptr inbounds i32, i32* %p1.023, i64 2
+ %4 = load i32, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %p2.024, i64 2
+ %5 = load i32, i32* %arrayidx13, align 4
+ %sub14 = sub nsw i32 %4, %5
+ %cmp15 = icmp slt i32 %sub14, 0
+ %sub17 = sub nsw i32 0, %sub14
+ %sub17.sub14 = select i1 %cmp15, i32 %sub17, i32 %sub14
+ %add19 = add nsw i32 %add11, %sub17.sub14
+ %arrayidx20 = getelementptr inbounds i32, i32* %p1.023, i64 3
+ %6 = load i32, i32* %arrayidx20, align 4
+ %arrayidx21 = getelementptr inbounds i32, i32* %p2.024, i64 3
+ %7 = load i32, i32* %arrayidx21, align 4
+ %sub22 = sub nsw i32 %6, %7
+ %cmp23 = icmp slt i32 %sub22, 0
+ %sub25 = sub nsw i32 0, %sub22
+ %v.3 = select i1 %cmp23, i32 %sub25, i32 %sub22
+ %add27 = add nsw i32 %add19, %v.3
+ %add.ptr = getelementptr inbounds i32, i32* %p1.023, i64 %idx.ext
+ %add.ptr29 = getelementptr inbounds i32, i32* %p2.024, i64 %idx.ext
+ %inc = add nuw nsw i32 %j.025, 1
+ %exitcond = icmp eq i32 %inc, %h
+ br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %s.0.lcssa = phi i32 [ 0, %entry ], [ %add27, %for.end.loopexit ]
+ ret i32 %s.0.lcssa
+}
+
+;; Check whether SLP can find a reduction phi whose incoming blocks are not
+;; the same as the block containing the phi.
+;;
+;; Came from code like:
+;;
+;; int s = 0;
+;; for (int j = 0; j < h; j++) {
+;;   s += p1[0] * p2[0];
+;; s += p1[1] * p2[1];
+;; s += p1[2] * p2[2];
+;; s += p1[3] * p2[3];
+;; if (s >= lim)
+;; break;
+;; p1 += lx;
+;; p2 += lx;
+;; }
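+;;
+;; An illustrative sketch (assumed shape, not FileCheck'd) of the CFG: the
+;; phi lives in %for.body, but its loop-carried value arrives through the
+;; latch %if.end, so the phi's incoming blocks differ from its own block:
+;;
+;;   for.body:
+;;     %s = phi i32 [ 0, %for.body.lr.ph ], [ %s.next, %if.end ]
+;;     %s.next = add nsw i32 %s, ...
+;;     br i1 %below.lim, label %if.end, label %for.end.loopexit
+;;   if.end:
+;;     br i1 %more.iters, label %for.body, label %for.end.loopexit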
+define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) {
+; YAML: --- !Passed
+; YAML-NEXT: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedHorizontalReduction
+; YAML-NEXT: Function: reduction_with_br
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; YAML-NEXT: - Cost: '-11'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '3'
+; CHECK-LABEL: @reduction_with_br(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_16:%.*]] = icmp sgt i32 [[H:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP_16]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[S_020:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT: [[J_019:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END]] ]
+; CHECK-NEXT: [[P2_018:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR16:%.*]], [[IF_END]] ]
+; CHECK-NEXT: [[P1_017:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END]] ]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 1
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 2
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 2
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_017]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[S_020]]
+; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[ADD]], undef
+; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD5]], undef
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP5]], [[S_020]]
+; CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD9]], undef
+; CHECK-NEXT: [[CMP14:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]]
+; CHECK-NEXT: br i1 [[CMP14]], label [[IF_END]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_017]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[ADD_PTR16]] = getelementptr inbounds i32, i32* [[P2_018]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_019]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[H]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[S_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[S_1]]
+;
+entry:
+ %cmp.16 = icmp sgt i32 %h, 0
+ br i1 %cmp.16, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %idx.ext = sext i32 %lx to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %if.end
+ %s.020 = phi i32 [ 0, %for.body.lr.ph ], [ %add13, %if.end ]
+ %j.019 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end ]
+ %p2.018 = phi i32* [ %blk2, %for.body.lr.ph ], [ %add.ptr16, %if.end ]
+ %p1.017 = phi i32* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %if.end ]
+ %0 = load i32, i32* %p1.017, align 4
+ %1 = load i32, i32* %p2.018, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %s.020
+ %arrayidx2 = getelementptr inbounds i32, i32* %p1.017, i64 1
+ %2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %p2.018, i64 1
+ %3 = load i32, i32* %arrayidx3, align 4
+ %mul4 = mul nsw i32 %3, %2
+ %add5 = add nsw i32 %add, %mul4
+ %arrayidx6 = getelementptr inbounds i32, i32* %p1.017, i64 2
+ %4 = load i32, i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32, i32* %p2.018, i64 2
+ %5 = load i32, i32* %arrayidx7, align 4
+ %mul8 = mul nsw i32 %5, %4
+ %add9 = add nsw i32 %add5, %mul8
+ %arrayidx10 = getelementptr inbounds i32, i32* %p1.017, i64 3
+ %6 = load i32, i32* %arrayidx10, align 4
+ %arrayidx11 = getelementptr inbounds i32, i32* %p2.018, i64 3
+ %7 = load i32, i32* %arrayidx11, align 4
+ %mul12 = mul nsw i32 %7, %6
+ %add13 = add nsw i32 %add9, %mul12
+ %cmp14 = icmp slt i32 %add13, %lim
+ br i1 %cmp14, label %if.end, label %for.end.loopexit
+
+if.end: ; preds = %for.body
+ %add.ptr = getelementptr inbounds i32, i32* %p1.017, i64 %idx.ext
+ %add.ptr16 = getelementptr inbounds i32, i32* %p2.018, i64 %idx.ext
+ %inc = add nuw nsw i32 %j.019, 1
+ %cmp = icmp slt i32 %inc, %h
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body, %if.end
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %s.1 = phi i32 [ 0, %entry ], [ %add13, %for.end.loopexit ]
+ ret i32 %s.1
+}
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedHorizontalReduction
+; YAML-NEXT: Function: test_unrolled_select
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; YAML-NEXT: - Cost: '-47'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '10'
+
+define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
+; CHECK-LABEL: @test_unrolled_select(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP_43:%.*]] = icmp sgt i32 [[H:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP_43]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[S_047:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[IF_END_86:%.*]] ]
+; CHECK-NEXT: [[J_046:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END_86]] ]
+; CHECK-NEXT: [[P2_045:%.*]] = phi i8* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR88:%.*]], [[IF_END_86]] ]
+; CHECK-NEXT: [[P1_044:%.*]] = phi i8* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END_86]] ]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 1
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 1
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 2
+; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 2
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 3
+; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 3
+; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 4
+; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 4
+; CHECK-NEXT: [[ARRAYIDX50:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 5
+; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 5
+; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 6
+; CHECK-NEXT: [[ARRAYIDX63:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 6
+; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1_044]] to <8 x i8>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2_045]] to <8 x i8>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> [[TMP6]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[S_047]]
+; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD]], undef
+; CHECK-NEXT: [[ADD27:%.*]] = add nsw i32 [[ADD16]], undef
+; CHECK-NEXT: [[ADD38:%.*]] = add nsw i32 [[ADD27]], undef
+; CHECK-NEXT: [[ADD49:%.*]] = add nsw i32 [[ADD38]], undef
+; CHECK-NEXT: [[ADD60:%.*]] = add nsw i32 [[ADD49]], undef
+; CHECK-NEXT: [[ADD71:%.*]] = add nsw i32 [[ADD60]], undef
+; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP9]])
+; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP10]], [[S_047]]
+; CHECK-NEXT: [[ADD82:%.*]] = add nsw i32 [[ADD71]], undef
+; CHECK-NEXT: [[CMP83:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]]
+; CHECK-NEXT: br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]]
+; CHECK: if.end.86:
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[P1_044]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[ADD_PTR88]] = getelementptr inbounds i8, i8* [[P2_045]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_046]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[H]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]]
+; CHECK: for.end.loopexit:
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[S_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[S_1]]
+;
+entry:
+ %cmp.43 = icmp sgt i32 %h, 0
+ br i1 %cmp.43, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %idx.ext = sext i32 %lx to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %if.end.86
+ %s.047 = phi i32 [ 0, %for.body.lr.ph ], [ %add82, %if.end.86 ]
+ %j.046 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %if.end.86 ]
+ %p2.045 = phi i8* [ %blk2, %for.body.lr.ph ], [ %add.ptr88, %if.end.86 ]
+ %p1.044 = phi i8* [ %blk1, %for.body.lr.ph ], [ %add.ptr, %if.end.86 ]
+ %0 = load i8, i8* %p1.044, align 1
+ %conv = zext i8 %0 to i32
+ %1 = load i8, i8* %p2.045, align 1
+ %conv2 = zext i8 %1 to i32
+ %sub = sub nsw i32 %conv, %conv2
+ %cmp3 = icmp slt i32 %sub, 0
+ %sub5 = sub nsw i32 0, %sub
+ %sub5.sub = select i1 %cmp3, i32 %sub5, i32 %sub
+ %add = add nsw i32 %sub5.sub, %s.047
+ %arrayidx6 = getelementptr inbounds i8, i8* %p1.044, i64 1
+ %2 = load i8, i8* %arrayidx6, align 1
+ %conv7 = zext i8 %2 to i32
+ %arrayidx8 = getelementptr inbounds i8, i8* %p2.045, i64 1
+ %3 = load i8, i8* %arrayidx8, align 1
+ %conv9 = zext i8 %3 to i32
+ %sub10 = sub nsw i32 %conv7, %conv9
+ %cmp11 = icmp slt i32 %sub10, 0
+ %sub14 = sub nsw i32 0, %sub10
+ %v.1 = select i1 %cmp11, i32 %sub14, i32 %sub10
+ %add16 = add nsw i32 %add, %v.1
+ %arrayidx17 = getelementptr inbounds i8, i8* %p1.044, i64 2
+ %4 = load i8, i8* %arrayidx17, align 1
+ %conv18 = zext i8 %4 to i32
+ %arrayidx19 = getelementptr inbounds i8, i8* %p2.045, i64 2
+ %5 = load i8, i8* %arrayidx19, align 1
+ %conv20 = zext i8 %5 to i32
+ %sub21 = sub nsw i32 %conv18, %conv20
+ %cmp22 = icmp slt i32 %sub21, 0
+ %sub25 = sub nsw i32 0, %sub21
+ %sub25.sub21 = select i1 %cmp22, i32 %sub25, i32 %sub21
+ %add27 = add nsw i32 %add16, %sub25.sub21
+ %arrayidx28 = getelementptr inbounds i8, i8* %p1.044, i64 3
+ %6 = load i8, i8* %arrayidx28, align 1
+ %conv29 = zext i8 %6 to i32
+ %arrayidx30 = getelementptr inbounds i8, i8* %p2.045, i64 3
+ %7 = load i8, i8* %arrayidx30, align 1
+ %conv31 = zext i8 %7 to i32
+ %sub32 = sub nsw i32 %conv29, %conv31
+ %cmp33 = icmp slt i32 %sub32, 0
+ %sub36 = sub nsw i32 0, %sub32
+ %v.3 = select i1 %cmp33, i32 %sub36, i32 %sub32
+ %add38 = add nsw i32 %add27, %v.3
+ %arrayidx39 = getelementptr inbounds i8, i8* %p1.044, i64 4
+ %8 = load i8, i8* %arrayidx39, align 1
+ %conv40 = zext i8 %8 to i32
+ %arrayidx41 = getelementptr inbounds i8, i8* %p2.045, i64 4
+ %9 = load i8, i8* %arrayidx41, align 1
+ %conv42 = zext i8 %9 to i32
+ %sub43 = sub nsw i32 %conv40, %conv42
+ %cmp44 = icmp slt i32 %sub43, 0
+ %sub47 = sub nsw i32 0, %sub43
+ %sub47.sub43 = select i1 %cmp44, i32 %sub47, i32 %sub43
+ %add49 = add nsw i32 %add38, %sub47.sub43
+ %arrayidx50 = getelementptr inbounds i8, i8* %p1.044, i64 5
+ %10 = load i8, i8* %arrayidx50, align 1
+ %conv51 = zext i8 %10 to i32
+ %arrayidx52 = getelementptr inbounds i8, i8* %p2.045, i64 5
+ %11 = load i8, i8* %arrayidx52, align 1
+ %conv53 = zext i8 %11 to i32
+ %sub54 = sub nsw i32 %conv51, %conv53
+ %cmp55 = icmp slt i32 %sub54, 0
+ %sub58 = sub nsw i32 0, %sub54
+ %v.5 = select i1 %cmp55, i32 %sub58, i32 %sub54
+ %add60 = add nsw i32 %add49, %v.5
+ %arrayidx61 = getelementptr inbounds i8, i8* %p1.044, i64 6
+ %12 = load i8, i8* %arrayidx61, align 1
+ %conv62 = zext i8 %12 to i32
+ %arrayidx63 = getelementptr inbounds i8, i8* %p2.045, i64 6
+ %13 = load i8, i8* %arrayidx63, align 1
+ %conv64 = zext i8 %13 to i32
+ %sub65 = sub nsw i32 %conv62, %conv64
+ %cmp66 = icmp slt i32 %sub65, 0
+ %sub69 = sub nsw i32 0, %sub65
+ %sub69.sub65 = select i1 %cmp66, i32 %sub69, i32 %sub65
+ %add71 = add nsw i32 %add60, %sub69.sub65
+ %arrayidx72 = getelementptr inbounds i8, i8* %p1.044, i64 7
+ %14 = load i8, i8* %arrayidx72, align 1
+ %conv73 = zext i8 %14 to i32
+ %arrayidx74 = getelementptr inbounds i8, i8* %p2.045, i64 7
+ %15 = load i8, i8* %arrayidx74, align 1
+ %conv75 = zext i8 %15 to i32
+ %sub76 = sub nsw i32 %conv73, %conv75
+ %cmp77 = icmp slt i32 %sub76, 0
+ %sub80 = sub nsw i32 0, %sub76
+ %v.7 = select i1 %cmp77, i32 %sub80, i32 %sub76
+ %add82 = add nsw i32 %add71, %v.7
+ %cmp83 = icmp slt i32 %add82, %lim
+ br i1 %cmp83, label %if.end.86, label %for.end.loopexit
+
+if.end.86: ; preds = %for.body
+ %add.ptr = getelementptr inbounds i8, i8* %p1.044, i64 %idx.ext
+ %add.ptr88 = getelementptr inbounds i8, i8* %p2.045, i64 %idx.ext
+ %inc = add nuw nsw i32 %j.046, 1
+ %cmp = icmp slt i32 %inc, %h
+ br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body, %if.end.86
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ %s.1 = phi i32 [ 0, %entry ], [ %add82, %for.end.loopexit ]
+ ret i32 %s.1
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg
new file mode 100644
index 00000000000..7184443994b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AArch64' in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll
new file mode 100644
index 00000000000..6ff1118d3cc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/load-store-q.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S -basicaa -slp-vectorizer < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+; Holding a value live across a call boundary may require
+; spills and fills. This is the case for <2 x double>,
+; which occupies a Q register, none of which are fully
+; callee-saved.
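+; Concretely: vectorizing %x and %y below into a single
+; <2 x double> load would keep the value live in a Q register
+; across the call to @g(); under AAPCS64 only the low 64 bits
+; of v8-v15 are callee-saved, so the full 128-bit value would
+; have to be spilled and reloaded around the call.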
+
+; CHECK: load double
+; CHECK: load double
+; CHECK: call void @g
+; CHECK: store double
+; CHECK: store double
+define void @f(double* %p, double* %q) {
+ %addr2 = getelementptr double, double* %q, i32 1
+ %addr = getelementptr double, double* %p, i32 1
+ %x = load double, double* %p
+ %y = load double, double* %addr
+ call void @g()
+ store double %x, double* %q
+ store double %y, double* %addr2
+ ret void
+}
+declare void @g()
+
+; Check that we deal with loops correctly.
+;
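+; Here there is no call inside the loop, so a <2 x double>
+; value can stay live across the backedge without spilling,
+; and vectorizing the phi/store/load chain is expected to pay off.
+;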
+; CHECK: store <2 x double>
+; CHECK: load <2 x double>
+define void @f2(double* %p, double* %q) {
+entry:
+ br label %loop
+
+loop:
+ %p1 = phi double [0.0, %entry], [%x, %loop]
+ %p2 = phi double [0.0, %entry], [%y, %loop]
+ %addr2 = getelementptr double, double* %q, i32 1
+ %addr = getelementptr double, double* %p, i32 1
+ store double %p1, double* %q
+ store double %p2, double* %addr2
+
+ %x = load double, double* %p
+ %y = load double, double* %addr
+ br label %loop
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
new file mode 100644
index 00000000000..fdd12c1e807
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; This test is reduced from the matrix multiplication benchmark in the test-suite:
+; https://github.com/llvm/llvm-test-suite/tree/master/SingleSource/Benchmarks/Misc/matmul_f64_4x4.c
+; The operations here are expected to be vectorized to <2 x double>.
+; Otherwise, performance will suffer on Cortex-A53.
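+; A sketch of the reduced source (a 2x2 by 2x4 row-major product;
+; names paraphrased from the benchmark):
+;   for (int i = 0; i < 2; i++)
+;     for (int j = 0; j < 4; j++)
+;       Out[i*4 + j] = A[i][0] * B[0][j] + A[i][1] * B[1][j];
+; Each pair of adjacent Out elements maps to one <2 x double> op.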
+
+define void @wrap_mul4(double* nocapture %Out, [2 x double]* nocapture readonly %A, [4 x double]* nocapture readonly %B) {
+; CHECK-LABEL: @wrap_mul4(
+; CHECK-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[TEMP:%.*]] = load double, double* [[ARRAYIDX1_I]], align 8
+; CHECK-NEXT: [[ARRAYIDX3_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[TEMP2:%.*]] = load double, double* [[ARRAYIDX5_I]], align 8
+; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 0
+; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TEMP]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> undef, double [[TEMP2]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]]
+; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2
+; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2
+; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>*
+; CHECK-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[TMP12]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]]
+; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>*
+; CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP9]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[TMP14]], [[TMP17]]
+; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0
+; CHECK-NEXT: [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8
+; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1
+; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TEMP10]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TEMP10]], i32 1
+; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[TMP2]], [[TMP20]]
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> undef, double [[TEMP11]], i32 0
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[TMP22]], double [[TEMP11]], i32 1
+; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP7]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[TMP21]], [[TMP24]]
+; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP20]]
+; CHECK-NEXT: [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP23]]
+; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[TMP26]], [[TMP27]]
+; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast double* [[OUT]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP29]], align 8
+; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 2
+; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 3
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP18]], <2 x double>* [[TMP30]], align 8
+; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 4
+; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 5
+; CHECK-NEXT: [[TMP31:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP25]], <2 x double>* [[TMP31]], align 8
+; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 6
+; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 7
+; CHECK-NEXT: [[TMP32:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP28]], <2 x double>* [[TMP32]], align 8
+; CHECK-NEXT: ret void
+;
+ %arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0
+ %temp = load double, double* %arrayidx1.i, align 8
+ %arrayidx3.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 0
+ %temp1 = load double, double* %arrayidx3.i, align 8
+ %mul.i = fmul double %temp, %temp1
+ %arrayidx5.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 1
+ %temp2 = load double, double* %arrayidx5.i, align 8
+ %arrayidx7.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 1, i64 0
+ %temp3 = load double, double* %arrayidx7.i, align 8
+ %mul8.i = fmul double %temp2, %temp3
+ %add.i = fadd double %mul.i, %mul8.i
+ %arrayidx13.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 1
+ %temp4 = load double, double* %arrayidx13.i, align 8
+ %mul14.i = fmul double %temp, %temp4
+ %arrayidx18.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 1, i64 1
+ %temp5 = load double, double* %arrayidx18.i, align 8
+ %mul19.i = fmul double %temp2, %temp5
+ %add20.i = fadd double %mul14.i, %mul19.i
+ %arrayidx25.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 2
+ %temp6 = load double, double* %arrayidx25.i, align 8
+ %mul26.i = fmul double %temp, %temp6
+ %arrayidx30.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 1, i64 2
+ %temp7 = load double, double* %arrayidx30.i, align 8
+ %mul31.i = fmul double %temp2, %temp7
+ %add32.i = fadd double %mul26.i, %mul31.i
+ %arrayidx37.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 0, i64 3
+ %temp8 = load double, double* %arrayidx37.i, align 8
+ %mul38.i = fmul double %temp, %temp8
+ %arrayidx42.i = getelementptr inbounds [4 x double], [4 x double]* %B, i64 1, i64 3
+ %temp9 = load double, double* %arrayidx42.i, align 8
+ %mul43.i = fmul double %temp2, %temp9
+ %add44.i = fadd double %mul38.i, %mul43.i
+ %arrayidx47.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 1, i64 0
+ %temp10 = load double, double* %arrayidx47.i, align 8
+ %mul50.i = fmul double %temp1, %temp10
+ %arrayidx52.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 1, i64 1
+ %temp11 = load double, double* %arrayidx52.i, align 8
+ %mul55.i = fmul double %temp3, %temp11
+ %add56.i = fadd double %mul50.i, %mul55.i
+ %mul62.i = fmul double %temp4, %temp10
+ %mul67.i = fmul double %temp5, %temp11
+ %add68.i = fadd double %mul62.i, %mul67.i
+ %mul74.i = fmul double %temp6, %temp10
+ %mul79.i = fmul double %temp7, %temp11
+ %add80.i = fadd double %mul74.i, %mul79.i
+ %mul86.i = fmul double %temp8, %temp10
+ %mul91.i = fmul double %temp9, %temp11
+ %add92.i = fadd double %mul86.i, %mul91.i
+ store double %add.i, double* %Out, align 8
+ %Res.i.sroa.4.0.Out2.i.sroa_idx2 = getelementptr inbounds double, double* %Out, i64 1
+ store double %add20.i, double* %Res.i.sroa.4.0.Out2.i.sroa_idx2, align 8
+ %Res.i.sroa.5.0.Out2.i.sroa_idx4 = getelementptr inbounds double, double* %Out, i64 2
+ store double %add32.i, double* %Res.i.sroa.5.0.Out2.i.sroa_idx4, align 8
+ %Res.i.sroa.6.0.Out2.i.sroa_idx6 = getelementptr inbounds double, double* %Out, i64 3
+ store double %add44.i, double* %Res.i.sroa.6.0.Out2.i.sroa_idx6, align 8
+ %Res.i.sroa.7.0.Out2.i.sroa_idx8 = getelementptr inbounds double, double* %Out, i64 4
+ store double %add56.i, double* %Res.i.sroa.7.0.Out2.i.sroa_idx8, align 8
+ %Res.i.sroa.8.0.Out2.i.sroa_idx10 = getelementptr inbounds double, double* %Out, i64 5
+ store double %add68.i, double* %Res.i.sroa.8.0.Out2.i.sroa_idx10, align 8
+ %Res.i.sroa.9.0.Out2.i.sroa_idx12 = getelementptr inbounds double, double* %Out, i64 6
+ store double %add80.i, double* %Res.i.sroa.9.0.Out2.i.sroa_idx12, align 8
+ %Res.i.sroa.10.0.Out2.i.sroa_idx14 = getelementptr inbounds double, double* %Out, i64 7
+ store double %add92.i, double* %Res.i.sroa.10.0.Out2.i.sroa_idx14, align 8
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll
new file mode 100644
index 00000000000..0429665c0ae
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; This test ensures that we do not regress due to PR26364. The vectorizer
+; should not compute a smaller size for %k.13 since it is in a use-def cycle
+; and cannot be demoted.
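+; Concretely, %k.13 is defined by a phi whose loop-carried input is
+; %k.3, and %k.3 is a select on %k.13, so the demotion analysis can
+; never prove a narrower bit width safe for either value.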
+;
+define fastcc void @PR26364() {
+; CHECK-LABEL: @PR26364(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[FOR_END11:%.*]], label [[FOR_COND4:%.*]]
+; CHECK: for.cond4:
+; CHECK-NEXT: [[K_13:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[K_3:%.*]], [[FOR_COND4]] ]
+; CHECK-NEXT: [[E_02:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 0, [[FOR_COND4]] ]
+; CHECK-NEXT: [[E_1:%.*]] = select i1 undef, i32 [[E_02]], i32 0
+; CHECK-NEXT: [[K_3]] = select i1 undef, i32 [[K_13]], i32 undef
+; CHECK-NEXT: br label [[FOR_COND4]]
+; CHECK: for.end11:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %for.end11, label %for.cond4
+
+for.cond4:
+ %k.13 = phi i32 [ undef, %entry ], [ %k.3, %for.cond4 ]
+ %e.02 = phi i32 [ 1, %entry ], [ 0, %for.cond4 ]
+ %e.1 = select i1 undef, i32 %e.02, i32 0
+ %k.3 = select i1 undef, i32 %k.13, i32 undef
+ br label %for.cond4
+
+for.end11:
+ ret void
+}
+
+; This test ensures that we do not regress due to PR26629. We must look at
+; every root in the vectorizable tree when computing minimum sizes since one
+; root may require fewer bits than another.
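+; Here the i72 bitfield chain mixes a value that fits in 13 bits
+; (%bf.value15, masked with 8191) with values that need the full
+; 72 bits (%bf.clear13), so sizing the tree from a single root
+; would pick the wrong minimum width.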
+;
+define void @PR26629(i32* %c) {
+; CHECK-LABEL: @PR26629(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[FOR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C:%.*]], align 4
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[D:%.*]] = phi i72 [ 576507472957710340, [[FOR_PH]] ], [ [[BF_SET17:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[TMP0]], undef
+; CHECK-NEXT: [[BF_CLEAR13:%.*]] = and i72 [[D]], -576460748008464384
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[SUB]] to i72
+; CHECK-NEXT: [[BF_VALUE15:%.*]] = and i72 [[TMP1]], 8191
+; CHECK-NEXT: [[BF_CLEAR16:%.*]] = or i72 [[BF_VALUE15]], [[BF_CLEAR13]]
+; CHECK-NEXT: [[BF_SET17]] = or i72 [[BF_CLEAR16]], undef
+; CHECK-NEXT: br label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %for.ph, label %for.end
+
+for.ph:
+ %0 = load i32, i32* %c, align 4
+ br label %for.body
+
+for.body:
+ %d = phi i72 [ 576507472957710340, %for.ph ], [ %bf.set17, %for.body ]
+ %sub = sub i32 %0, undef
+ %bf.clear13 = and i72 %d, -576460748008464384
+ %1 = zext i32 %sub to i72
+ %bf.value15 = and i72 %1, 8191
+ %bf.clear16 = or i72 %bf.value15, %bf.clear13
+ %bf.set17 = or i72 %bf.clear16, undef
+ br label %for.body
+
+for.end:
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll
new file mode 100644
index 00000000000..64b8743f8bd
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+define i64 @mismatched_intrinsics(<4 x i32> %in1, <2 x i32> %in2) nounwind {
+; CHECK-LABEL: @mismatched_intrinsics(
+; CHECK-NEXT: [[VADDLVQ_S32_I:%.*]] = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> [[IN1:%.*]])
+; CHECK-NEXT: [[VADDLV_S32_I:%.*]] = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> [[IN2:%.*]])
+; CHECK-NEXT: [[TST:%.*]] = icmp sgt i64 [[VADDLVQ_S32_I]], [[VADDLV_S32_I]]
+; CHECK-NEXT: [[EQUAL:%.*]] = sext i1 [[TST]] to i64
+; CHECK-NEXT: ret i64 [[EQUAL]]
+;
+
+ %vaddlvq_s32.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %in1) #2
+ %vaddlv_s32.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %in2) #2
+ %tst = icmp sgt i64 %vaddlvq_s32.i, %vaddlv_s32.i
+ %equal = sext i1 %tst to i64
+ ret i64 %equal
+}
+
+declare i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %in1)
+declare i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %in1)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll
new file mode 100644
index 00000000000..98c033262a0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -basicaa -slp-vectorizer -dce < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios5.0.0"
+
+define void @foo(float* noalias %a, float* noalias %b, float* noalias %c) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !nontemporal !0
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4, !nontemporal !0
+; CHECK-NEXT: ret void
+;
+entry:
+; Check that we don't lose the !nontemporal hint when vectorizing loads.
+ %b1 = load float, float* %b, align 4, !nontemporal !0
+ %arrayidx.1 = getelementptr inbounds float, float* %b, i64 1
+ %b2 = load float, float* %arrayidx.1, align 4, !nontemporal !0
+ %arrayidx.2 = getelementptr inbounds float, float* %b, i64 2
+ %b3 = load float, float* %arrayidx.2, align 4, !nontemporal !0
+ %arrayidx.3 = getelementptr inbounds float, float* %b, i64 3
+ %b4 = load float, float* %arrayidx.3, align 4, !nontemporal !0
+
+; Check that we don't introduce the !nontemporal hint when the original scalar loads didn't have it.
+ %c1 = load float, float* %c, align 4
+ %arrayidx2.1 = getelementptr inbounds float, float* %c, i64 1
+ %c2 = load float, float* %arrayidx2.1, align 4
+ %arrayidx2.2 = getelementptr inbounds float, float* %c, i64 2
+ %c3 = load float, float* %arrayidx2.2, align 4
+ %arrayidx2.3 = getelementptr inbounds float, float* %c, i64 3
+ %c4 = load float, float* %arrayidx2.3, align 4
+
+ %a1 = fadd float %b1, %c1
+ %a2 = fadd float %b2, %c2
+ %a3 = fadd float %b3, %c3
+ %a4 = fadd float %b4, %c4
+
+; Check that we don't lose the !nontemporal hint when vectorizing stores.
+ store float %a1, float* %a, align 4, !nontemporal !0
+ %arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1
+ store float %a2, float* %arrayidx3.1, align 4, !nontemporal !0
+ %arrayidx3.2 = getelementptr inbounds float, float* %a, i64 2
+ store float %a3, float* %arrayidx3.2, align 4, !nontemporal !0
+ %arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3
+ store float %a4, float* %arrayidx3.3, align 4, !nontemporal !0
+
+ ret void
+}
+
+define void @foo2(float* noalias %a, float* noalias %b) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+; Check that we don't mark the vector load with the !nontemporal attribute
+; if some of the original scalar loads don't have it.
+ %b1 = load float, float* %b, align 4, !nontemporal !0
+ %arrayidx.1 = getelementptr inbounds float, float* %b, i64 1
+ %b2 = load float, float* %arrayidx.1, align 4
+ %arrayidx.2 = getelementptr inbounds float, float* %b, i64 2
+ %b3 = load float, float* %arrayidx.2, align 4
+ %arrayidx.3 = getelementptr inbounds float, float* %b, i64 3
+ %b4 = load float, float* %arrayidx.3, align 4, !nontemporal !0
+
+; Check that we don't mark the vector store with the !nontemporal attribute
+; if some of the original scalar stores don't have it.
+ store float %b1, float* %a, align 4, !nontemporal !0
+ %arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1
+ store float %b2, float* %arrayidx3.1, align 4
+ %arrayidx3.2 = getelementptr inbounds float, float* %a, i64 2
+ store float %b3, float* %arrayidx3.2, align 4
+ %arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3
+ store float %b4, float* %arrayidx3.3, align 4, !nontemporal !0
+
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
new file mode 100644
index 00000000000..ba089bc0602
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/remarks.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
+
+define void @f(double* %r, double* %w) {
+ %r0 = getelementptr inbounds double, double* %r, i64 0
+ %r1 = getelementptr inbounds double, double* %r, i64 1
+ %f0 = load double, double* %r0
+ %f1 = load double, double* %r1
+ %add0 = fadd double %f0, %f0
+ %add1 = fadd double %f1, %f1
+ %w0 = getelementptr inbounds double, double* %w, i64 0
+ %w1 = getelementptr inbounds double, double* %w, i64 1
+; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
+ store double %add0, double* %w0, !dbg !9
+ store double %add1, double* %w1
+ ret void
+}
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
+!7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 10, scope: !7)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
new file mode 100644
index 00000000000..8311f205149
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* %b, align 4
+ %1 = load i32, i32* %c, align 4
+ %add = add nsw i32 %1, %0
+ %div = sdiv i32 %add, 2
+ store i32 %div, i32* %a, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 1
+ %2 = load i32, i32* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 1
+ %3 = load i32, i32* %arrayidx4, align 4
+ %add5 = add nsw i32 %3, %2
+ %div6 = sdiv i32 %add5, 2
+ %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1
+ store i32 %div6, i32* %arrayidx7, align 4
+ %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 2
+ %4 = load i32, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 2
+ %5 = load i32, i32* %arrayidx9, align 4
+ %add10 = add nsw i32 %5, %4
+ %div11 = sdiv i32 %add10, 2
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2
+ store i32 %div11, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %b, i64 3
+ %6 = load i32, i32* %arrayidx13, align 4
+ %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 3
+ %7 = load i32, i32* %arrayidx14, align 4
+ %add15 = add nsw i32 %7, %6
+ %div16 = sdiv i32 %add15, 2
+ %arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3
+ store i32 %div16, i32* %arrayidx17, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
new file mode 100644
index 00000000000..98a9fd482e8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Debug information shouldn't affect the spill cost.
+; RUN: opt -S -slp-vectorizer %s -o - | FileCheck %s
+
+target triple = "aarch64"
+
+%struct.S = type { i64, i64 }
+
+define void @patatino(i64 %n, i64 %i, %struct.S* %p) !dbg !7 {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[N:%.*]], metadata !18, metadata !DIExpression()), !dbg !23
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[I:%.*]], metadata !19, metadata !DIExpression()), !dbg !24
+; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.S* [[P:%.*]], metadata !20, metadata !DIExpression()), !dbg !25
+; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P]], i64 [[N]], i32 0, !dbg !26
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata !21, metadata !DIExpression()), !dbg !27
+; CHECK-NEXT: [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg !28
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[X1]] to <2 x i64>*, !dbg !26
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg !26, !tbaa !29
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata !22, metadata !DIExpression()), !dbg !33
+; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg !34
+; CHECK-NEXT: [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg !35
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[X5]] to <2 x i64>*, !dbg !36
+; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg !36, !tbaa !29
+; CHECK-NEXT: ret void, !dbg !37
+;
+entry:
+ call void @llvm.dbg.value(metadata i64 %n, metadata !18, metadata !DIExpression()), !dbg !23
+ call void @llvm.dbg.value(metadata i64 %i, metadata !19, metadata !DIExpression()), !dbg !24
+ call void @llvm.dbg.value(metadata %struct.S* %p, metadata !20, metadata !DIExpression()), !dbg !25
+ %x1 = getelementptr inbounds %struct.S, %struct.S* %p, i64 %n, i32 0, !dbg !26
+ %0 = load i64, i64* %x1, align 8, !dbg !26, !tbaa !27
+ call void @llvm.dbg.value(metadata i64 %0, metadata !21, metadata !DIExpression()), !dbg !32
+ %y3 = getelementptr inbounds %struct.S, %struct.S* %p, i64 %n, i32 1, !dbg !33
+ %1 = load i64, i64* %y3, align 8, !dbg !33, !tbaa !34
+ call void @llvm.dbg.value(metadata i64 %1, metadata !22, metadata !DIExpression()), !dbg !35
+ %x5 = getelementptr inbounds %struct.S, %struct.S* %p, i64 %i, i32 0, !dbg !36
+ store i64 %0, i64* %x5, align 8, !dbg !37, !tbaa !27
+ %y7 = getelementptr inbounds %struct.S, %struct.S* %p, i64 %i, i32 1, !dbg !38
+ store i64 %1, i64* %y7, align 8, !dbg !39, !tbaa !34
+ ret void, !dbg !40
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 7.0.0 (trunk 330946) (llvm/trunk 330976)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "slp-reduced.c", directory: "/usr2/gberry/local/loop-align")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 7.0.0 (trunk 330946) (llvm/trunk 330976)"}
+!7 = distinct !DISubprogram(name: "patatino", scope: !1, file: !1, line: 6, type: !8, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10, !10, !11}
+!10 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+!12 = !DIDerivedType(tag: DW_TAG_typedef, name: "S", file: !1, line: 4, baseType: !13)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, file: !1, line: 1, size: 128, elements: !14)
+!14 = !{!15, !16}
+!15 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !13, file: !1, line: 2, baseType: !10, size: 64)
+!16 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !13, file: !1, line: 3, baseType: !10, size: 64, offset: 64)
+!17 = !{!18, !19, !20, !21, !22}
+!18 = !DILocalVariable(name: "n", arg: 1, scope: !7, file: !1, line: 6, type: !10)
+!19 = !DILocalVariable(name: "i", arg: 2, scope: !7, file: !1, line: 6, type: !10)
+!20 = !DILocalVariable(name: "p", arg: 3, scope: !7, file: !1, line: 6, type: !11)
+!21 = !DILocalVariable(name: "x", scope: !7, file: !1, line: 7, type: !10)
+!22 = !DILocalVariable(name: "y", scope: !7, file: !1, line: 8, type: !10)
+!23 = !DILocation(line: 6, column: 15, scope: !7)
+!24 = !DILocation(line: 6, column: 23, scope: !7)
+!25 = !DILocation(line: 6, column: 29, scope: !7)
+!26 = !DILocation(line: 7, column: 19, scope: !7)
+!27 = !{!28, !29, i64 0}
+!28 = !{!"", !29, i64 0, !29, i64 8}
+!29 = !{!"long", !30, i64 0}
+!30 = !{!"omnipotent char", !31, i64 0}
+!31 = !{!"Simple C/C++ TBAA"}
+!32 = !DILocation(line: 7, column: 10, scope: !7)
+!33 = !DILocation(line: 8, column: 19, scope: !7)
+!34 = !{!28, !29, i64 8}
+!35 = !DILocation(line: 8, column: 10, scope: !7)
+!36 = !DILocation(line: 9, column: 10, scope: !7)
+!37 = !DILocation(line: 9, column: 12, scope: !7)
+!38 = !DILocation(line: 10, column: 10, scope: !7)
+!39 = !DILocation(line: 10, column: 12, scope: !7)
+!40 = !DILocation(line: 11, column: 1, scope: !7)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
new file mode 100644
index 00000000000..9e9f40825ef
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -0,0 +1,310 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
+; CHECK-LABEL: @build_vec_v2i64(
+; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i64> [[V0:%.*]], i32 0
+; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i64> [[V0]], i32 1
+; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i64> [[V1:%.*]], i32 0
+; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i64> [[V1]], i32 1
+; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2_0]], i32 0
+; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <2 x i64> [[TMP3_0]], i64 [[TMP2_1]], i32 1
+; CHECK-NEXT: ret <2 x i64> [[TMP3_1]]
+;
+ %v0.0 = extractelement <2 x i64> %v0, i32 0
+ %v0.1 = extractelement <2 x i64> %v0, i32 1
+ %v1.0 = extractelement <2 x i64> %v1, i32 0
+ %v1.1 = extractelement <2 x i64> %v1, i32 1
+ %tmp0.0 = add i64 %v0.0, %v1.0
+ %tmp0.1 = add i64 %v0.1, %v1.1
+ %tmp1.0 = sub i64 %v0.0, %v1.0
+ %tmp1.1 = sub i64 %v0.1, %v1.1
+ %tmp2.0 = add i64 %tmp0.0, %tmp0.1
+ %tmp2.1 = add i64 %tmp1.0, %tmp1.1
+ %tmp3.0 = insertelement <2 x i64> undef, i64 %tmp2.0, i32 0
+ %tmp3.1 = insertelement <2 x i64> %tmp3.0, i64 %tmp2.1, i32 1
+ ret <2 x i64> %tmp3.1
+}
+
+define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
+; CHECK-LABEL: @store_chain_v2i64(
+; CHECK-NEXT: [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
+; CHECK-NEXT: [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
+; CHECK-NEXT: [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
+; CHECK-NEXT: [[V0_0:%.*]] = load i64, i64* [[A]], align 8
+; CHECK-NEXT: [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
+; CHECK-NEXT: [[V1_0:%.*]] = load i64, i64* [[B]], align 8
+; CHECK-NEXT: [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
+; CHECK-NEXT: [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT: [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT: store i64 [[TMP2_0]], i64* [[C]], align 8
+; CHECK-NEXT: store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT: ret void
+;
+ %a.0 = getelementptr i64, i64* %a, i64 0
+ %a.1 = getelementptr i64, i64* %a, i64 1
+ %b.0 = getelementptr i64, i64* %b, i64 0
+ %b.1 = getelementptr i64, i64* %b, i64 1
+ %c.0 = getelementptr i64, i64* %c, i64 0
+ %c.1 = getelementptr i64, i64* %c, i64 1
+ %v0.0 = load i64, i64* %a.0, align 8
+ %v0.1 = load i64, i64* %a.1, align 8
+ %v1.0 = load i64, i64* %b.0, align 8
+ %v1.1 = load i64, i64* %b.1, align 8
+ %tmp0.0 = add i64 %v0.0, %v1.0
+ %tmp0.1 = add i64 %v0.1, %v1.1
+ %tmp1.0 = sub i64 %v0.0, %v1.0
+ %tmp1.1 = sub i64 %v0.1, %v1.1
+ %tmp2.0 = add i64 %tmp0.0, %tmp0.1
+ %tmp2.1 = add i64 %tmp1.0, %tmp1.1
+ store i64 %tmp2.0, i64* %c.0, align 8
+ store i64 %tmp2.1, i64* %c.1, align 8
+ ret void
+}
+
+define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT: ret <4 x i32> [[TMP11]]
+;
+ %v0.0 = extractelement <4 x i32> %v0, i32 0
+ %v0.1 = extractelement <4 x i32> %v0, i32 1
+ %v0.2 = extractelement <4 x i32> %v0, i32 2
+ %v0.3 = extractelement <4 x i32> %v0, i32 3
+ %v1.0 = extractelement <4 x i32> %v1, i32 0
+ %v1.1 = extractelement <4 x i32> %v1, i32 1
+ %v1.2 = extractelement <4 x i32> %v1, i32 2
+ %v1.3 = extractelement <4 x i32> %v1, i32 3
+ %tmp0.0 = add i32 %v0.0, %v1.0
+ %tmp0.1 = add i32 %v0.1, %v1.1
+ %tmp0.2 = add i32 %v0.2, %v1.2
+ %tmp0.3 = add i32 %v0.3, %v1.3
+ %tmp1.0 = sub i32 %v0.0, %v1.0
+ %tmp1.1 = sub i32 %v0.1, %v1.1
+ %tmp1.2 = sub i32 %v0.2, %v1.2
+ %tmp1.3 = sub i32 %v0.3, %v1.3
+ %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+ %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+ %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+ %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+ %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
+ %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+ %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
+ %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
+ ret <4 x i32> %tmp3.3
+}
+
+define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_reuse_0(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: ret <4 x i32> [[TMP3_3]]
+;
+ %v0.0 = extractelement <2 x i32> %v0, i32 0
+ %v0.1 = extractelement <2 x i32> %v0, i32 1
+ %v1.0 = extractelement <2 x i32> %v1, i32 0
+ %v1.1 = extractelement <2 x i32> %v1, i32 1
+ %tmp0.0 = add i32 %v0.0, %v1.0
+ %tmp0.1 = add i32 %v0.1, %v1.1
+ %tmp1.0 = sub i32 %v0.0, %v1.0
+ %tmp1.1 = sub i32 %v0.1, %v1.1
+ %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+ %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+ %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
+ %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+ %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.0, i32 2
+ %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.1, i32 3
+ ret <4 x i32> %tmp3.3
+}
+
+define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_reuse_1(
+; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
+; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
+; CHECK-NEXT: [[TMP0_3:%.*]] = xor i32 [[V0_1]], [[V1_1]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP1_2:%.*]] = sub i32 [[TMP0_2]], [[TMP0_3]]
+; CHECK-NEXT: [[TMP1_3:%.*]] = sub i32 [[TMP0_3]], [[TMP0_2]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP2_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP2_1:%.*]] = insertelement <4 x i32> [[TMP2_0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP2_2:%.*]] = insertelement <4 x i32> [[TMP2_1]], i32 [[TMP1_2]], i32 2
+; CHECK-NEXT: [[TMP2_3:%.*]] = insertelement <4 x i32> [[TMP2_2]], i32 [[TMP1_3]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[TMP2_3]]
+;
+ %v0.0 = extractelement <2 x i32> %v0, i32 0
+ %v0.1 = extractelement <2 x i32> %v0, i32 1
+ %v1.0 = extractelement <2 x i32> %v1, i32 0
+ %v1.1 = extractelement <2 x i32> %v1, i32 1
+ %tmp0.0 = add i32 %v0.0, %v1.0
+ %tmp0.1 = add i32 %v0.1, %v1.1
+ %tmp0.2 = xor i32 %v0.0, %v1.0
+ %tmp0.3 = xor i32 %v0.1, %v1.1
+ %tmp1.0 = sub i32 %tmp0.0, %tmp0.1
+ %tmp1.1 = sub i32 %tmp0.0, %tmp0.1
+ %tmp1.2 = sub i32 %tmp0.2, %tmp0.3
+ %tmp1.3 = sub i32 %tmp0.3, %tmp0.2
+ %tmp2.0 = insertelement <4 x i32> undef, i32 %tmp1.0, i32 0
+ %tmp2.1 = insertelement <4 x i32> %tmp2.0, i32 %tmp1.1, i32 1
+ %tmp2.2 = insertelement <4 x i32> %tmp2.1, i32 %tmp1.2, i32 2
+ %tmp2.3 = insertelement <4 x i32> %tmp2.2, i32 %tmp1.3, i32 3
+ ret <4 x i32> %tmp2.3
+}
+
+define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
+; CHECK-LABEL: @build_vec_v4i32_3_binops(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x i32> [[TMP3_3]]
+;
+ %v0.0 = extractelement <2 x i32> %v0, i32 0
+ %v0.1 = extractelement <2 x i32> %v0, i32 1
+ %v1.0 = extractelement <2 x i32> %v1, i32 0
+ %v1.1 = extractelement <2 x i32> %v1, i32 1
+ %tmp0.0 = add i32 %v0.0, %v1.0
+ %tmp0.1 = add i32 %v0.1, %v1.1
+ %tmp0.2 = xor i32 %v0.0, %v1.0
+ %tmp0.3 = xor i32 %v0.1, %v1.1
+ %tmp1.0 = mul i32 %v0.0, %v1.0
+ %tmp1.1 = mul i32 %v0.1, %v1.1
+ %tmp1.2 = xor i32 %v0.0, %v1.0
+ %tmp1.3 = xor i32 %v0.1, %v1.1
+ %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+ %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+ %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+ %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+ %tmp3.0 = insertelement <4 x i32> undef, i32 %tmp2.0, i32 0
+ %tmp3.1 = insertelement <4 x i32> %tmp3.0, i32 %tmp2.1, i32 1
+ %tmp3.2 = insertelement <4 x i32> %tmp3.1, i32 %tmp2.2, i32 2
+ %tmp3.3 = insertelement <4 x i32> %tmp3.2, i32 %tmp2.3, i32 3
+ ret <4 x i32> %tmp3.3
+}
+
+define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
+; CHECK-LABEL: @reduction_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]]
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP11]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i32> [[TMP12]], <i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <4 x i32> [[TMP13]], <i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i32> [[TMP15]], [[TMP14]]
+; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP16]])
+; CHECK-NEXT: ret i32 [[TMP17]]
+;
+ %v0.0 = extractelement <4 x i32> %v0, i32 0
+ %v0.1 = extractelement <4 x i32> %v0, i32 1
+ %v0.2 = extractelement <4 x i32> %v0, i32 2
+ %v0.3 = extractelement <4 x i32> %v0, i32 3
+ %v1.0 = extractelement <4 x i32> %v1, i32 0
+ %v1.1 = extractelement <4 x i32> %v1, i32 1
+ %v1.2 = extractelement <4 x i32> %v1, i32 2
+ %v1.3 = extractelement <4 x i32> %v1, i32 3
+ %tmp0.0 = add i32 %v0.0, %v1.0
+ %tmp0.1 = add i32 %v0.1, %v1.1
+ %tmp0.2 = add i32 %v0.2, %v1.2
+ %tmp0.3 = add i32 %v0.3, %v1.3
+ %tmp1.0 = sub i32 %v0.0, %v1.0
+ %tmp1.1 = sub i32 %v0.1, %v1.1
+ %tmp1.2 = sub i32 %v0.2, %v1.2
+ %tmp1.3 = sub i32 %v0.3, %v1.3
+ %tmp2.0 = add i32 %tmp0.0, %tmp0.1
+ %tmp2.1 = add i32 %tmp1.0, %tmp1.1
+ %tmp2.2 = add i32 %tmp0.2, %tmp0.3
+ %tmp2.3 = add i32 %tmp1.2, %tmp1.3
+ %tmp3.0 = lshr i32 %tmp2.0, 15
+ %tmp3.1 = lshr i32 %tmp2.1, 15
+ %tmp3.2 = lshr i32 %tmp2.2, 15
+ %tmp3.3 = lshr i32 %tmp2.3, 15
+ %tmp4.0 = and i32 %tmp3.0, 65537
+ %tmp4.1 = and i32 %tmp3.1, 65537
+ %tmp4.2 = and i32 %tmp3.2, 65537
+ %tmp4.3 = and i32 %tmp3.3, 65537
+ %tmp5.0 = mul nuw i32 %tmp4.0, 65535
+ %tmp5.1 = mul nuw i32 %tmp4.1, 65535
+ %tmp5.2 = mul nuw i32 %tmp4.2, 65535
+ %tmp5.3 = mul nuw i32 %tmp4.3, 65535
+ %tmp6.0 = add i32 %tmp5.0, %tmp2.0
+ %tmp6.1 = add i32 %tmp5.1, %tmp2.1
+ %tmp6.2 = add i32 %tmp5.2, %tmp2.2
+ %tmp6.3 = add i32 %tmp5.3, %tmp2.3
+ %tmp7.0 = xor i32 %tmp6.0, %tmp5.0
+ %tmp7.1 = xor i32 %tmp6.1, %tmp5.1
+ %tmp7.2 = xor i32 %tmp6.2, %tmp5.2
+ %tmp7.3 = xor i32 %tmp6.3, %tmp5.3
+ %reduce.0 = add i32 %tmp7.1, %tmp7.0
+ %reduce.1 = add i32 %reduce.0, %tmp7.2
+ %reduce.2 = add i32 %reduce.1, %tmp7.3
+ ret i32 %reduce.2
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
new file mode 100644
index 00000000000..6d7a6a200dd
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+; This test is reduced from the TSVC evaluation of vectorizers:
+; https://github.com/llvm/llvm-test-suite/commits/master/MultiSource/Benchmarks/TSVC/LoopRerolling-flt/tsc.c
+; Two loads and an fmul are expected to be vectorized to <2 x float>.
+; Otherwise, performance will suffer on Cortex-A53.
+; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details.
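+;
+; A rough C sketch of the reduced kernel (an illustrative reconstruction of
+; the TSVC s352 inner loop, not the exact benchmark source):
+;
+;   float s352(const float *a, const float *b) {
+;     float dot = 0.0f;
+;     for (int i = 0; i < 32000; i += 5)
+;       dot += a[i]*b[i] + a[i+1]*b[i+1] + a[i+2]*b[i+2]
+;            + a[i+3]*b[i+3] + a[i+4]*b[i+4];
+;     return dot;
+;   }
+;
+; The SLP vectorizer should turn at least one adjacent pair of a[]/b[]
+; products into a <2 x float> load/load/fmul sequence.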
+
+%struct.GlobalData = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float], [5 x i32], [12 x i8], [32000 x float], [7 x i32], [4 x i8], [32000 x float], [11 x i32], [4 x i8], [32000 x float], [13 x i32], [12 x i8], [256 x [256 x float]], [17 x i32], [12 x i8], [256 x [256 x float]], [19 x i32], [4 x i8], [256 x [256 x float]], [23 x i32], [4 x i8], [256 x [256 x float]] }
+
+@global_data = common dso_local global %struct.GlobalData zeroinitializer, align 16
+
+define i32 @s352() {
+; CHECK-LABEL: @s352(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[PREHEADER:%.*]]
+; CHECK: preheader:
+; CHECK-NEXT: [[NL_017:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret i32 0
+; CHECK: for.cond.cleanup3:
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[NL_017]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1600000
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[PREHEADER]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[DOT_115:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD39:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[MUL7:%.*]] = fmul float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[DOT_115]], [[MUL7]]
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX10]], align 4
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[MUL14:%.*]] = fmul float [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[ADD]], [[MUL14]]
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX18]] to <2 x float>*
+; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
+; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX21]] to <2 x float>*
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; CHECK-NEXT: [[ADD23:%.*]] = fadd float [[ADD15]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD23]], [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX34]], align 4
+; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[ARRAYIDX37]], align 4
+; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[ADD39]] = fadd float [[ADD31]], [[MUL38]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 32000
+; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP3]]
+;
+entry:
+ br label %preheader
+
+preheader:
+ %nl.017 = phi i32 [ 0, %entry ], [ %inc, %for.cond.cleanup3 ]
+ br label %for.body
+
+for.cond.cleanup:
+ ret i32 0
+
+for.cond.cleanup3:
+ %inc = add nuw nsw i32 %nl.017, 1
+ %exitcond = icmp eq i32 %inc, 1600000
+ br i1 %exitcond, label %for.cond.cleanup, label %preheader
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for.body ]
+ %dot.115 = phi float [ 0.000000e+00, %preheader ], [ %add39, %for.body ]
+ %arrayidx = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx6 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %indvars.iv
+ %1 = load float, float* %arrayidx6, align 4
+ %mul7 = fmul float %0, %1
+ %add = fadd float %dot.115, %mul7
+ %2 = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx10 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %2
+ %3 = load float, float* %arrayidx10, align 4
+ %arrayidx13 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %2
+ %4 = load float, float* %arrayidx13, align 4
+ %mul14 = fmul float %3, %4
+ %add15 = fadd float %add, %mul14
+ %5 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx18 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %5
+ %6 = load float, float* %arrayidx18, align 4
+ %arrayidx21 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %5
+ %7 = load float, float* %arrayidx21, align 4
+ %mul22 = fmul float %6, %7
+ %add23 = fadd float %add15, %mul22
+ %8 = add nuw nsw i64 %indvars.iv, 3
+ %arrayidx26 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %8
+ %9 = load float, float* %arrayidx26, align 4
+ %arrayidx29 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %8
+ %10 = load float, float* %arrayidx29, align 4
+ %mul30 = fmul float %9, %10
+ %add31 = fadd float %add23, %mul30
+ %11 = add nuw nsw i64 %indvars.iv, 4
+ %arrayidx34 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %11
+ %12 = load float, float* %arrayidx34, align 4
+ %arrayidx37 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %11
+ %13 = load float, float* %arrayidx37, align 4
+ %mul38 = fmul float %12, %13
+ %add39 = fadd float %add31, %mul38
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+ %cmp2 = icmp ult i64 %indvars.iv.next, 32000
+ br i1 %cmp2, label %for.body, label %for.cond.cleanup3
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
new file mode 100644
index 00000000000..735ce651ed7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/address-space-ptr-sze-gep-index-assert.ll
@@ -0,0 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -slp-threshold=-18 < %s | FileCheck %s
+
+; Make sure there's no SCEV assert when the indices are for different-sized
+; address spaces. Per the datalayout below, flat (p0) pointers are 64 bits
+; while private (p5) pointers are 32 bits, so the GEP indices differ in width.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
+define void @slp_scev_assert(i32 %idx, i64 %tmp3) #0 {
+; CHECK-LABEL: @slp_scev_assert(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP:%.*]] = addrspacecast i8 addrspace(5)* undef to i8*
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 [[IDX:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP]], i64 [[TMP3:%.*]]
+; CHECK-NEXT: store i8 0, i8 addrspace(5)* [[TMP2]]
+; CHECK-NEXT: store i8 0, i8* [[TMP4]]
+; CHECK-NEXT: ret void
+;
+bb:
+ %tmp = addrspacecast i8 addrspace(5)* undef to i8*
+ %tmp2 = getelementptr inbounds i8, i8 addrspace(5)* undef, i32 %idx
+ %tmp4 = getelementptr inbounds i8, i8* %tmp, i64 %tmp3
+ store i8 0, i8 addrspace(5)* %tmp2
+ store i8 0, i8* %tmp4
+ ret void
+}
+
+define void @multi_as_reduction_different_sized(i32 addrspace(3)* %lds, i32 %idx0, i64 %idx1) #0 {
+; CHECK-LABEL: @multi_as_reduction_different_sized(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[FLAT:%.*]] = addrspacecast i32 addrspace(3)* [[LDS:%.*]] to i32*
+; CHECK-NEXT: [[ADD0:%.*]] = add i32 [[IDX0:%.*]], 2
+; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
+; CHECK-NEXT: [[LDS_1:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* [[LDS]], i32 [[ADD0]]
+; CHECK-NEXT: [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
+; CHECK-NEXT: [[LOAD_LDS_0:%.*]] = load i32, i32 addrspace(3)* [[LDS]], align 4
+; CHECK-NEXT: [[LOAD_LDS_1:%.*]] = load i32, i32 addrspace(3)* [[LDS_1]], align 4
+; CHECK-NEXT: [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
+; CHECK-NEXT: [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
+; CHECK-NEXT: [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_LDS_0]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_LDS_1]]
+; CHECK-NEXT: store i32 [[SUB0]], i32* undef
+; CHECK-NEXT: store i32 [[SUB1]], i32* undef
+; CHECK-NEXT: ret void
+;
+bb:
+ %flat = addrspacecast i32 addrspace(3)* %lds to i32*
+ %add0 = add i32 %idx0, 2
+ %add1 = add i64 %idx1, 1
+
+ %lds.1 = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 %add0
+ %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+ %load.lds.0 = load i32, i32 addrspace(3)* %lds, align 4
+ %load.lds.1 = load i32, i32 addrspace(3)* %lds.1, align 4
+
+ %load.flat.0 = load i32, i32* %flat, align 4
+ %load.flat.1 = load i32, i32* %flat.1, align 4
+
+ %sub0 = sub i32 %load.flat.0, %load.lds.0
+ %sub1 = sub i32 %load.flat.1, %load.lds.1
+
+ store i32 %sub0, i32* undef
+ store i32 %sub1, i32* undef
+ ret void
+}
+
+; This should vectorize if using GetUnderlyingObject.
+define void @multi_as_reduction_same_size(i32 addrspace(1)* %global, i64 %idx0, i64 %idx1) #0 {
+; CHECK-LABEL: @multi_as_reduction_same_size(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[FLAT:%.*]] = addrspacecast i32 addrspace(1)* [[GLOBAL:%.*]] to i32*
+; CHECK-NEXT: [[ADD0:%.*]] = add i64 [[IDX0:%.*]], 2
+; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
+; CHECK-NEXT: [[GLOBAL_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[GLOBAL]], i64 [[ADD0]]
+; CHECK-NEXT: [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
+; CHECK-NEXT: [[LOAD_GLOBAL_0:%.*]] = load i32, i32 addrspace(1)* [[GLOBAL]], align 4
+; CHECK-NEXT: [[LOAD_GLOBAL_1:%.*]] = load i32, i32 addrspace(1)* [[GLOBAL_1]], align 4
+; CHECK-NEXT: [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
+; CHECK-NEXT: [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
+; CHECK-NEXT: [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_GLOBAL_0]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_GLOBAL_1]]
+; CHECK-NEXT: store i32 [[SUB0]], i32* undef
+; CHECK-NEXT: store i32 [[SUB1]], i32* undef
+; CHECK-NEXT: ret void
+;
+bb:
+ %flat = addrspacecast i32 addrspace(1)* %global to i32*
+ %add0 = add i64 %idx0, 2
+ %add1 = add i64 %idx1, 1
+
+ %global.1 = getelementptr inbounds i32, i32 addrspace(1)* %global, i64 %add0
+ %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+ %load.global.0 = load i32, i32 addrspace(1)* %global, align 4
+ %load.global.1 = load i32, i32 addrspace(1)* %global.1, align 4
+
+ %load.flat.0 = load i32, i32* %flat, align 4
+ %load.flat.1 = load i32, i32* %flat.1, align 4
+
+ %sub0 = sub i32 %load.flat.0, %load.global.0
+ %sub1 = sub i32 %load.flat.1, %load.global.1
+
+ store i32 %sub0, i32* undef
+ store i32 %sub1, i32* undef
+ ret void
+}
+
+; This should vectorize if using GetUnderlyingObject.
+; The adds are done at i64 width, even though addrspace(3) pointers are only 32 bits wide.
+define void @multi_as_reduction_different_sized_noncanon(i32 addrspace(3)* %lds, i64 %idx0, i64 %idx1) #0 {
+; CHECK-LABEL: @multi_as_reduction_different_sized_noncanon(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[FLAT:%.*]] = addrspacecast i32 addrspace(3)* [[LDS:%.*]] to i32*
+; CHECK-NEXT: [[ADD0:%.*]] = add i64 [[IDX0:%.*]], 2
+; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IDX1:%.*]], 1
+; CHECK-NEXT: [[LDS_1:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* [[LDS]], i64 [[ADD0]]
+; CHECK-NEXT: [[FLAT_1:%.*]] = getelementptr inbounds i32, i32* [[FLAT]], i64 [[ADD1]]
+; CHECK-NEXT: [[LOAD_LDS_0:%.*]] = load i32, i32 addrspace(3)* [[LDS]], align 4
+; CHECK-NEXT: [[LOAD_LDS_1:%.*]] = load i32, i32 addrspace(3)* [[LDS_1]], align 4
+; CHECK-NEXT: [[LOAD_FLAT_0:%.*]] = load i32, i32* [[FLAT]], align 4
+; CHECK-NEXT: [[LOAD_FLAT_1:%.*]] = load i32, i32* [[FLAT_1]], align 4
+; CHECK-NEXT: [[SUB0:%.*]] = sub i32 [[LOAD_FLAT_0]], [[LOAD_LDS_0]]
+; CHECK-NEXT: [[SUB1:%.*]] = sub i32 [[LOAD_FLAT_1]], [[LOAD_LDS_1]]
+; CHECK-NEXT: store i32 [[SUB0]], i32* undef
+; CHECK-NEXT: store i32 [[SUB1]], i32* undef
+; CHECK-NEXT: ret void
+;
+bb:
+ %flat = addrspacecast i32 addrspace(3)* %lds to i32*
+ %add0 = add i64 %idx0, 2
+ %add1 = add i64 %idx1, 1
+
+ %lds.1 = getelementptr inbounds i32, i32 addrspace(3)* %lds, i64 %add0
+ %flat.1 = getelementptr inbounds i32, i32* %flat, i64 %add1
+
+ %load.lds.0 = load i32, i32 addrspace(3)* %lds, align 4
+ %load.lds.1 = load i32, i32 addrspace(3)* %lds.1, align 4
+
+ %load.flat.0 = load i32, i32* %flat, align 4
+ %load.flat.1 = load i32, i32* %flat.1, align 4
+
+ %sub0 = sub i32 %load.flat.0, %load.lds.0
+ %sub1 = sub i32 %load.flat.1, %load.lds.1
+
+ store i32 %sub0, i32* undef
+ store i32 %sub1, i32* undef
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
new file mode 100644
index 00000000000..4007a0d30ed
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
@@ -0,0 +1,250 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -S -slp-threshold=-100 -slp-vectorize-hor-store -dce | FileCheck %s --check-prefix=GFX9
+
+@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
+@arr64 = local_unnamed_addr global [32 x i64] zeroinitializer, align 16
+@var = global i32 zeroinitializer, align 8
+@var64 = global i64 zeroinitializer, align 8
+
+@farr = local_unnamed_addr global [32 x float] zeroinitializer, align 16
+@fvar = global float zeroinitializer, align 8
+
+@darr = local_unnamed_addr global [32 x double] zeroinitializer, align 16
+@dvar = global double zeroinitializer, align 8
+
+; Tests whether the min/max reduction pattern is vectorized if SLP starts at the store.
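+; A rough C analogue of the pattern (illustrative only; arr/var correspond to
+; the globals above):
+;
+;   int max = arr[0] > arr[1] ? arr[0] : arr[1];
+;   for (int i = 2; i < 6; i++)
+;     max = max > arr[i] ? max : arr[i];
+;   var = arr[0] > arr[1] ? 3 : 4;  /* store fed by the first compare */
+;   return max;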
+define i32 @smaxv6() {
+; GFX9-LABEL: @smaxv6(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
+; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]]
+; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i32 [[TMP5]], i32 [[SELECT1]]
+; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4
+; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8
+; GFX9-NEXT: ret i32 [[OP_EXTRA]]
+;
+ %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+ %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+ %cmp1 = icmp sgt i32 %load1, %load2
+ %select1 = select i1 %cmp1, i32 %load1, i32 %load2
+
+ %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+ %cmp2 = icmp sgt i32 %select1, %load3
+ %select2 = select i1 %cmp2, i32 %select1, i32 %load3
+
+ %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+ %cmp3 = icmp sgt i32 %select2, %load4
+ %select3 = select i1 %cmp3, i32 %select2, i32 %load4
+
+ %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+ %cmp4 = icmp sgt i32 %select3, %load5
+ %select4 = select i1 %cmp4, i32 %select3, i32 %load5
+
+ %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+ %cmp5 = icmp sgt i32 %select4, %load6
+ %select5 = select i1 %cmp5, i32 %select4, i32 %load6
+
+ %store-select = select i1 %cmp1, i32 3, i32 4
+ store i32 %store-select, i32* @var, align 8
+ ret i32 %select5
+}
+
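+; Same pattern as @smaxv6, but as a signed-min reduction over i64.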
+define i64 @sminv6() {
+; GFX9-LABEL: @sminv6(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([32 x i64]* @arr64 to <2 x i64>*), align 16
+; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]]
+; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]]
+; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP6:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 [[SELECT1]]
+; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4
+; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8
+; GFX9-NEXT: ret i64 [[OP_EXTRA]]
+;
+ %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16
+ %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8
+ %cmp1 = icmp slt i64 %load1, %load2
+ %select1 = select i1 %cmp1, i64 %load1, i64 %load2
+
+ %load3 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2), align 16
+ %cmp2 = icmp slt i64 %select1, %load3
+ %select2 = select i1 %cmp2, i64 %select1, i64 %load3
+
+ %load4 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 3), align 8
+ %cmp3 = icmp slt i64 %select2, %load4
+ %select3 = select i1 %cmp3, i64 %select2, i64 %load4
+
+ %load5 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16
+ %cmp4 = icmp slt i64 %select3, %load5
+ %select4 = select i1 %cmp4, i64 %select3, i64 %load5
+
+ %load6 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8
+ %cmp5 = icmp slt i64 %select4, %load6
+ %select5 = select i1 %cmp5, i64 %select4, i64 %load6
+
+ %store-select = select i1 %cmp1, i64 3, i64 4
+ store i64 %store-select, i64* @var64, align 8
+ ret i64 %select5
+}
+
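+; Same pattern with a fast-math floating-point max reduction over float.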
+define float @fmaxv6() {
+; GFX9-LABEL: @fmaxv6(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16
+; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]]
+; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]]
+; GFX9-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]]
+; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00
+; GFX9-NEXT: store float [[STORE_SELECT]], float* @fvar, align 8
+; GFX9-NEXT: ret float [[OP_EXTRA]]
+;
+ %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16
+ %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4
+ %cmp1 = fcmp fast ogt float %load1, %load2
+ %select1 = select i1 %cmp1, float %load1, float %load2
+
+ %load3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2), align 8
+ %cmp2 = fcmp fast ogt float %select1, %load3
+ %select2 = select i1 %cmp2, float %select1, float %load3
+
+ %load4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 3), align 4
+ %cmp3 = fcmp fast ogt float %select2, %load4
+ %select3 = select i1 %cmp3, float %select2, float %load4
+
+ %load5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16
+ %cmp4 = fcmp fast ogt float %select3, %load5
+ %select4 = select i1 %cmp4, float %select3, float %load5
+
+ %load6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4
+ %cmp5 = fcmp fast ogt float %select4, %load6
+ %select5 = select i1 %cmp5, float %select4, float %load6
+
+ %store-select = select i1 %cmp1, float 3.0, float 4.0
+ store float %store-select, float* @fvar, align 8
+ ret float %select5
+}
+
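+; Same pattern with a fast-math floating-point min reduction over double.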
+define double @dminv6() {
+; GFX9-LABEL: @dminv6(
+; GFX9-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16
+; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]]
+; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]]
+; GFX9-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]]
+; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00
+; GFX9-NEXT: store double [[STORE_SELECT]], double* @dvar, align 8
+; GFX9-NEXT: ret double [[OP_EXTRA]]
+;
+ %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16
+ %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4
+ %cmp1 = fcmp fast olt double %load1, %load2
+ %select1 = select i1 %cmp1, double %load1, double %load2
+
+ %load3 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2), align 8
+ %cmp2 = fcmp fast olt double %select1, %load3
+ %select2 = select i1 %cmp2, double %select1, double %load3
+
+ %load4 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 3), align 4
+ %cmp3 = fcmp fast olt double %select2, %load4
+ %select3 = select i1 %cmp3, double %select2, double %load4
+
+ %load5 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16
+ %cmp4 = fcmp fast olt double %select3, %load5
+ %select4 = select i1 %cmp4, double %select3, double %load5
+
+ %load6 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4
+ %cmp5 = fcmp fast olt double %select4, %load6
+ %select5 = select i1 %cmp5, double %select4, double %load6
+
+ %store-select = select i1 %cmp1, double 3.0, double 4.0
+ store double %store-select, double* @dvar, align 8
+ ret double %select5
+}
+
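+; Same pattern, except the first reduction operand comes from a second,
+; value-distinct extractelement of lane 0 and is compared against an argument.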
+define i32 @smax_wdiff_valuenum(i32, i32 %v1) {
+; GFX9-LABEL: @smax_wdiff_valuenum(
+; GFX9-NEXT: [[VLOAD:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
+; GFX9-NEXT: [[ELT1:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
+; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]]
+; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0
+; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]]
+; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]]
+; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[SELECT1]]
+; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4
+; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8
+; GFX9-NEXT: ret i32 [[OP_EXTRA]]
+;
+ %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
+ %elt1 = extractelement <2 x i32> %vload, i32 0
+ %cmp1 = icmp sgt i32 %elt1, %v1
+ %ex0 = extractelement <2 x i32> %vload, i32 0
+ %select1 = select i1 %cmp1, i32 %ex0, i32 %v1
+
+ %load3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+ %cmp2 = icmp sgt i32 %select1, %load3
+ %select2 = select i1 %cmp2, i32 %select1, i32 %load3
+
+ %load4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+ %cmp3 = icmp sgt i32 %select2, %load4
+ %select3 = select i1 %cmp3, i32 %select2, i32 %load4
+
+ %load5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+ %cmp4 = icmp sgt i32 %select3, %load5
+ %select4 = select i1 %cmp4, i32 %select3, i32 %load5
+
+ %load6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+ %cmp5 = icmp sgt i32 %select4, %load6
+ %select5 = select i1 %cmp5, i32 %select4, i32 %load6
+
+ %storeval = select i1 %cmp1, i32 3, i32 4
+ store i32 %storeval, i32* @var, align 8
+ ret i32 %select5
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg
new file mode 100644
index 00000000000..6baccf05fff
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/lit.local.cfg
@@ -0,0 +1,3 @@
+if 'AMDGPU' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
new file mode 100644
index 00000000000..55905a4c444
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll
@@ -0,0 +1,203 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI,GFX89 %s
+
+; FIXME: We would still like to vectorize the memory operations for VI.
+
+; Simple 3-pair chain with loads and stores
+; GCN-LABEL: @test1_as_3_3_3_v2f16(
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX89: ret
+define amdgpu_kernel void @test1_as_3_3_3_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half addrspace(3)* %b, align 2
+ %mul = fmul half %i0, %i1
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+ %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+ %mul5 = fmul half %i3, %i4
+ store half %mul, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %mul5, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @test1_as_3_0_0(
+; GFX89: load <2 x half>, <2 x half> addrspace(3)*
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half>* %
+; GFX89: ret
+define amdgpu_kernel void @test1_as_3_0_0(half addrspace(3)* %a, half* %b, half* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half* %b, align 2
+ %mul = fmul half %i0, %i1
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
+ %i4 = load half, half* %arrayidx4, align 2
+ %mul5 = fmul half %i3, %i4
+ store half %mul, half* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half* %c, i64 1
+ store half %mul5, half* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @test1_as_0_0_3_v2f16(
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: load <2 x half>, <2 x half>*
+; GFX89: fmul <2 x half>
+; GFX89: store <2 x half> %{{.*}}, <2 x half> addrspace(3)* %
+; GFX89: ret
+define amdgpu_kernel void @test1_as_0_0_3_v2f16(half* %a, half* %b, half addrspace(3)* %c) {
+ %i0 = load half, half* %a, align 2
+ %i1 = load half, half* %b, align 2
+ %mul = fmul half %i0, %i1
+ %arrayidx3 = getelementptr inbounds half, half* %a, i64 1
+ %i3 = load half, half* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half* %b, i64 1
+ %i4 = load half, half* %arrayidx4, align 2
+ %mul5 = fmul half %i3, %i4
+ store half %mul, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %mul5, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @test1_fma_v2f16(
+; GFX9: load <2 x half>
+; GFX9: load <2 x half>
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half addrspace(3)* %b, align 2
+ %i2 = load half, half addrspace(3)* %c, align 2
+ %fma0 = call half @llvm.fma.f16(half %i0, half %i1, half %i2)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+ %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+ %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
+ store half %fma0, half addrspace(3)* %d, align 2
+ %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+ store half %fma1, half addrspace(3)* %arrayidx6, align 2
+ ret void
+}
+
+; GCN-LABEL: @mul_scalar_v2f16(
+; GFX9: load <2 x half>
+; GFX9: fmul <2 x half>
+; GFX9: store <2 x half>
+define amdgpu_kernel void @mul_scalar_v2f16(half addrspace(3)* %a, half %scalar, half addrspace(3)* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %mul = fmul half %i0, %scalar
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %mul5 = fmul half %i3, %scalar
+ store half %mul, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %mul5, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @fabs_v2f16
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fabs.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @fabs_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %fabs0 = call half @llvm.fabs.f16(half %i0)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %fabs1 = call half @llvm.fabs.f16(half %i3)
+ store half %fabs0, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %fabs1, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+; GCN-LABEL: @test1_fabs_fma_v2f16(
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fabs.v2f16(
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fabs_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half addrspace(3)* %b, align 2
+ %i2 = load half, half addrspace(3)* %c, align 2
+ %i0.fabs = call half @llvm.fabs.f16(half %i0)
+
+ %fma0 = call half @llvm.fma.f16(half %i0.fabs, half %i1, half %i2)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+ %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+ %i3.fabs = call half @llvm.fabs.f16(half %i3)
+
+ %fma1 = call half @llvm.fma.f16(half %i3.fabs, half %i4, half %i5)
+ store half %fma0, half addrspace(3)* %d, align 2
+ %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+ store half %fma1, half addrspace(3)* %arrayidx6, align 2
+ ret void
+}
+
+; FIXME: Should do a vector load and extract the component for fabs.
+; GCN-LABEL: @test1_fabs_scalar_fma_v2f16(
+; GFX9: load half
+; GFX9: call half @llvm.fabs.f16(
+; GFX9: load <2 x half>
+; GFX9: load half
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.fma.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @test1_fabs_scalar_fma_v2f16(half addrspace(3)* %a, half addrspace(3)* %b, half addrspace(3)* %c, half addrspace(3)* %d) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %i1 = load half, half addrspace(3)* %b, align 2
+ %i2 = load half, half addrspace(3)* %c, align 2
+ %i1.fabs = call half @llvm.fabs.f16(half %i1)
+
+ %fma0 = call half @llvm.fma.f16(half %i0, half %i1.fabs, half %i2)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %arrayidx4 = getelementptr inbounds half, half addrspace(3)* %b, i64 1
+ %i4 = load half, half addrspace(3)* %arrayidx4, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ %i5 = load half, half addrspace(3)* %arrayidx5, align 2
+ %fma1 = call half @llvm.fma.f16(half %i3, half %i4, half %i5)
+ store half %fma0, half addrspace(3)* %d, align 2
+ %arrayidx6 = getelementptr inbounds half, half addrspace(3)* %d, i64 1
+ store half %fma1, half addrspace(3)* %arrayidx6, align 2
+ ret void
+}
+
+; GCN-LABEL: @canonicalize_v2f16
+; GFX9: load <2 x half>
+; GFX9: call <2 x half> @llvm.canonicalize.v2f16(
+; GFX9: store <2 x half>
+define amdgpu_kernel void @canonicalize_v2f16(half addrspace(3)* %a, half addrspace(3)* %c) {
+ %i0 = load half, half addrspace(3)* %a, align 2
+ %canonicalize0 = call half @llvm.canonicalize.f16(half %i0)
+ %arrayidx3 = getelementptr inbounds half, half addrspace(3)* %a, i64 1
+ %i3 = load half, half addrspace(3)* %arrayidx3, align 2
+ %canonicalize1 = call half @llvm.canonicalize.f16(half %i3)
+ store half %canonicalize0, half addrspace(3)* %c, align 2
+ %arrayidx5 = getelementptr inbounds half, half addrspace(3)* %c, i64 1
+ store half %canonicalize1, half addrspace(3)* %arrayidx5, align 2
+ ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.fma.f16(half, half, half) #1
+declare half @llvm.canonicalize.f16(half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
new file mode 100644
index 00000000000..d7434394dcc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -0,0 +1,722 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer -dce < %s | FileCheck -check-prefixes=GCN,VI %s
+
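+; On GFX9 (which has packed 16-bit math) these reductions are expected to
+; vectorize into the log2 shuffle-and-op form; on VI they remain scalar chains.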
+define half @reduction_half4(<4 x half> %a) {
+; GFX9-LABEL: @reduction_half4(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x half> [[A]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[BIN_RDX]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[BIN_RDX2]], i32 0
+; GFX9-NEXT: ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half4(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
+; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT: ret half [[ADD3]]
+;
+entry:
+ %elt0 = extractelement <4 x half> %a, i64 0
+ %elt1 = extractelement <4 x half> %a, i64 1
+ %elt2 = extractelement <4 x half> %a, i64 2
+ %elt3 = extractelement <4 x half> %a, i64 3
+
+ %add1 = fadd fast half %elt1, %elt0
+ %add2 = fadd fast half %elt2, %add1
+ %add3 = fadd fast half %elt3, %add2
+
+ ret half %add3
+}
+
+define half @reduction_half8(<8 x half> %vec8) {
+; GFX9-LABEL: @reduction_half8(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x half> [[VEC8:%.*]], <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x half> [[VEC8]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x half> [[BIN_RDX]], <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x half> [[BIN_RDX2]], <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[BIN_RDX4]], i32 0
+; GFX9-NEXT: ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half8(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x half> [[VEC8:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x half> [[VEC8]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x half> [[VEC8]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x half> [[VEC8]], i64 3
+; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
+; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
+; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
+; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
+; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
+; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
+; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
+; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
+; VI-NEXT: ret half [[ADD7]]
+;
+entry:
+ %elt0 = extractelement <8 x half> %vec8, i64 0
+ %elt1 = extractelement <8 x half> %vec8, i64 1
+ %elt2 = extractelement <8 x half> %vec8, i64 2
+ %elt3 = extractelement <8 x half> %vec8, i64 3
+ %elt4 = extractelement <8 x half> %vec8, i64 4
+ %elt5 = extractelement <8 x half> %vec8, i64 5
+ %elt6 = extractelement <8 x half> %vec8, i64 6
+ %elt7 = extractelement <8 x half> %vec8, i64 7
+
+ %add1 = fadd fast half %elt1, %elt0
+ %add2 = fadd fast half %elt2, %add1
+ %add3 = fadd fast half %elt3, %add2
+ %add4 = fadd fast half %elt4, %add3
+ %add5 = fadd fast half %elt5, %add4
+ %add6 = fadd fast half %elt6, %add5
+ %add7 = fadd fast half %elt7, %add6
+
+ ret half %add7
+}
+
+define half @reduction_half16(<16 x half> %vec16) {
+; GFX9-LABEL: @reduction_half16(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x half> [[VEC16]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x half> [[BIN_RDX]], <16 x half> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x half> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x half> [[BIN_RDX2]], <16 x half> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x half> [[BIN_RDX4]], <16 x half> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x half> [[BIN_RDX4]], [[RDX_SHUF5]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x half> [[BIN_RDX6]], i32 0
+; GFX9-NEXT: ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_half16(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <16 x half> [[VEC16]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <16 x half> [[VEC16]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <16 x half> [[VEC16]], i64 3
+; VI-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
+; VI-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
+; VI-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
+; VI-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
+; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16]], i64 8
+; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
+; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
+; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
+; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
+; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
+; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
+; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
+; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
+; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
+; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
+; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
+; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
+; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
+; VI-NEXT: [[ADD8:%.*]] = fadd fast half [[ELT8]], [[ADD7]]
+; VI-NEXT: [[ADD9:%.*]] = fadd fast half [[ELT9]], [[ADD8]]
+; VI-NEXT: [[ADD10:%.*]] = fadd fast half [[ELT10]], [[ADD9]]
+; VI-NEXT: [[ADD11:%.*]] = fadd fast half [[ELT11]], [[ADD10]]
+; VI-NEXT: [[ADD12:%.*]] = fadd fast half [[ELT12]], [[ADD11]]
+; VI-NEXT: [[ADD13:%.*]] = fadd fast half [[ELT13]], [[ADD12]]
+; VI-NEXT: [[ADD14:%.*]] = fadd fast half [[ELT14]], [[ADD13]]
+; VI-NEXT: [[ADD15:%.*]] = fadd fast half [[ELT15]], [[ADD14]]
+; VI-NEXT: ret half [[ADD15]]
+;
+entry:
+ %elt0 = extractelement <16 x half> %vec16, i64 0
+ %elt1 = extractelement <16 x half> %vec16, i64 1
+ %elt2 = extractelement <16 x half> %vec16, i64 2
+ %elt3 = extractelement <16 x half> %vec16, i64 3
+ %elt4 = extractelement <16 x half> %vec16, i64 4
+ %elt5 = extractelement <16 x half> %vec16, i64 5
+ %elt6 = extractelement <16 x half> %vec16, i64 6
+ %elt7 = extractelement <16 x half> %vec16, i64 7
+ %elt8 = extractelement <16 x half> %vec16, i64 8
+ %elt9 = extractelement <16 x half> %vec16, i64 9
+ %elt10 = extractelement <16 x half> %vec16, i64 10
+ %elt11 = extractelement <16 x half> %vec16, i64 11
+ %elt12 = extractelement <16 x half> %vec16, i64 12
+ %elt13 = extractelement <16 x half> %vec16, i64 13
+ %elt14 = extractelement <16 x half> %vec16, i64 14
+ %elt15 = extractelement <16 x half> %vec16, i64 15
+
+ %add1 = fadd fast half %elt1, %elt0
+ %add2 = fadd fast half %elt2, %add1
+ %add3 = fadd fast half %elt3, %add2
+ %add4 = fadd fast half %elt4, %add3
+ %add5 = fadd fast half %elt5, %add4
+ %add6 = fadd fast half %elt6, %add5
+ %add7 = fadd fast half %elt7, %add6
+ %add8 = fadd fast half %elt8, %add7
+ %add9 = fadd fast half %elt9, %add8
+ %add10 = fadd fast half %elt10, %add9
+ %add11 = fadd fast half %elt11, %add10
+ %add12 = fadd fast half %elt12, %add11
+ %add13 = fadd fast half %elt13, %add12
+ %add14 = fadd fast half %elt14, %add13
+ %add15 = fadd fast half %elt15, %add14
+
+ ret half %add15
+}
+
+; FIXME: Support vectorizing this fsub reduction chain.
+define half @reduction_sub_half4(<4 x half> %a) {
+; GCN-LABEL: @reduction_sub_half4(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
+; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
+; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
+; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
+; GCN-NEXT: [[ADD1:%.*]] = fsub fast half [[ELT1]], [[ELT0]]
+; GCN-NEXT: [[ADD2:%.*]] = fsub fast half [[ELT2]], [[ADD1]]
+; GCN-NEXT: [[ADD3:%.*]] = fsub fast half [[ELT3]], [[ADD2]]
+; GCN-NEXT: ret half [[ADD3]]
+;
+entry:
+ %elt0 = extractelement <4 x half> %a, i64 0
+ %elt1 = extractelement <4 x half> %a, i64 1
+ %elt2 = extractelement <4 x half> %a, i64 2
+ %elt3 = extractelement <4 x half> %a, i64 3
+
+ %add1 = fsub fast half %elt1, %elt0
+ %add2 = fsub fast half %elt2, %add1
+ %add3 = fsub fast half %elt3, %add2
+
+ ret half %add3
+}
+
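+; Integer add reduction over i16; as above, this vectorizes on GFX9 only.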
+define i16 @reduction_v4i16(<4 x i16> %a) {
+; GFX9-LABEL: @reduction_v4i16(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[A:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX:%.*]] = add <4 x i16> [[A]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[BIN_RDX]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <4 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[BIN_RDX2]], i32 0
+; GFX9-NEXT: ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_v4i16(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[A]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[A]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[A]], i64 3
+; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
+; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
+; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
+; VI-NEXT: ret i16 [[ADD3]]
+;
+entry:
+ %elt0 = extractelement <4 x i16> %a, i64 0
+ %elt1 = extractelement <4 x i16> %a, i64 1
+ %elt2 = extractelement <4 x i16> %a, i64 2
+ %elt3 = extractelement <4 x i16> %a, i64 3
+
+ %add1 = add i16 %elt1, %elt0
+ %add2 = add i16 %elt2, %add1
+ %add3 = add i16 %elt3, %add2
+
+ ret i16 %add3
+}
+
+define i16 @reduction_v8i16(<8 x i16> %vec8) {
+; GFX9-LABEL: @reduction_v8i16(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[VEC8]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <8 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX2]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[BIN_RDX4:%.*]] = add <8 x i16> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[BIN_RDX4]], i32 0
+; GFX9-NEXT: ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_v8i16(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
+; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
+; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
+; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
+; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
+; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
+; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
+; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
+; VI-NEXT: [[ADD4:%.*]] = add i16 [[ELT4]], [[ADD3]]
+; VI-NEXT: [[ADD5:%.*]] = add i16 [[ELT5]], [[ADD4]]
+; VI-NEXT: [[ADD6:%.*]] = add i16 [[ELT6]], [[ADD5]]
+; VI-NEXT: [[ADD7:%.*]] = add i16 [[ELT7]], [[ADD6]]
+; VI-NEXT: ret i16 [[ADD7]]
+;
+entry:
+ %elt0 = extractelement <8 x i16> %vec8, i64 0
+ %elt1 = extractelement <8 x i16> %vec8, i64 1
+ %elt2 = extractelement <8 x i16> %vec8, i64 2
+ %elt3 = extractelement <8 x i16> %vec8, i64 3
+ %elt4 = extractelement <8 x i16> %vec8, i64 4
+ %elt5 = extractelement <8 x i16> %vec8, i64 5
+ %elt6 = extractelement <8 x i16> %vec8, i64 6
+ %elt7 = extractelement <8 x i16> %vec8, i64 7
+
+ %add1 = add i16 %elt1, %elt0
+ %add2 = add i16 %elt2, %add1
+ %add3 = add i16 %elt3, %add2
+ %add4 = add i16 %elt4, %add3
+ %add5 = add i16 %elt5, %add4
+ %add6 = add i16 %elt6, %add5
+ %add7 = add i16 %elt7, %add6
+
+ ret i16 %add7
+}
+
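+; Unsigned-min reduction over i16, expressed as icmp ult + select chains.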
+define i16 @reduction_umin_v4i16(<4 x i16> %vec4) {
+; GFX9-LABEL: @reduction_umin_v4i16(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i16> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_umin_v4i16(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
+; VI-NEXT: [[CMP1:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]]
+; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT: [[CMP2:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]]
+; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MIN1]]
+; VI-NEXT: [[CMP3:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]]
+; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MIN2]]
+; VI-NEXT: ret i16 [[MIN3]]
+;
+entry:
+ %elt0 = extractelement <4 x i16> %vec4, i64 0
+ %elt1 = extractelement <4 x i16> %vec4, i64 1
+ %elt2 = extractelement <4 x i16> %vec4, i64 2
+ %elt3 = extractelement <4 x i16> %vec4, i64 3
+
+ %cmp1 = icmp ult i16 %elt1, %elt0
+ %min1 = select i1 %cmp1, i16 %elt1, i16 %elt0
+ %cmp2 = icmp ult i16 %elt2, %min1
+ %min2 = select i1 %cmp2, i16 %elt2, i16 %min1
+ %cmp3 = icmp ult i16 %elt3, %min2
+ %min3 = select i1 %cmp3, i16 %elt3, i16 %min2
+
+ ret i16 %min3
+}
+
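+; The same umin pattern over eight lanes, written as explicit icmp/select
+; pairs: GFX9 needs three shuffle/select steps, VI stays scalar.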
+define i16 @reduction_icmp_v8i16(<8 x i16> %vec8) {
+; GFX9-LABEL: @reduction_icmp_v8i16(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i16> [[VEC8]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i16> [[VEC8]], <8 x i16> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> [[RDX_SHUF4]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[RDX_MINMAX_SELECT6]], i32 0
+; GFX9-NEXT: ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_icmp_v8i16(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
+; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
+; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
+; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
+; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
+; VI-NEXT: [[CMP0:%.*]] = icmp ult i16 [[ELT1]], [[ELT0]]
+; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP0]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT: [[CMP1:%.*]] = icmp ult i16 [[ELT2]], [[MIN1]]
+; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP1]], i16 [[ELT2]], i16 [[MIN1]]
+; VI-NEXT: [[CMP2:%.*]] = icmp ult i16 [[ELT3]], [[MIN2]]
+; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP2]], i16 [[ELT3]], i16 [[MIN2]]
+; VI-NEXT: [[CMP3:%.*]] = icmp ult i16 [[ELT4]], [[MIN3]]
+; VI-NEXT: [[MIN4:%.*]] = select i1 [[CMP3]], i16 [[ELT4]], i16 [[MIN3]]
+; VI-NEXT: [[CMP4:%.*]] = icmp ult i16 [[ELT5]], [[MIN4]]
+; VI-NEXT: [[MIN5:%.*]] = select i1 [[CMP4]], i16 [[ELT5]], i16 [[MIN4]]
+; VI-NEXT: [[CMP5:%.*]] = icmp ult i16 [[ELT6]], [[MIN5]]
+; VI-NEXT: [[MIN6:%.*]] = select i1 [[CMP5]], i16 [[ELT6]], i16 [[MIN5]]
+; VI-NEXT: [[CMP6:%.*]] = icmp ult i16 [[ELT7]], [[MIN6]]
+; VI-NEXT: [[MIN7:%.*]] = select i1 [[CMP6]], i16 [[ELT7]], i16 [[MIN6]]
+; VI-NEXT: ret i16 [[MIN7]]
+;
+entry:
+ %elt0 = extractelement <8 x i16> %vec8, i64 0
+ %elt1 = extractelement <8 x i16> %vec8, i64 1
+ %elt2 = extractelement <8 x i16> %vec8, i64 2
+ %elt3 = extractelement <8 x i16> %vec8, i64 3
+ %elt4 = extractelement <8 x i16> %vec8, i64 4
+ %elt5 = extractelement <8 x i16> %vec8, i64 5
+ %elt6 = extractelement <8 x i16> %vec8, i64 6
+ %elt7 = extractelement <8 x i16> %vec8, i64 7
+
+ %cmp0 = icmp ult i16 %elt1, %elt0
+ %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
+ %cmp1 = icmp ult i16 %elt2, %min1
+ %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
+ %cmp2 = icmp ult i16 %elt3, %min2
+ %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
+
+ %cmp3 = icmp ult i16 %elt4, %min3
+ %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
+ %cmp4 = icmp ult i16 %elt5, %min4
+ %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
+
+ %cmp5 = icmp ult i16 %elt6, %min5
+ %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
+ %cmp6 = icmp ult i16 %elt7, %min6
+ %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
+
+ ret i16 %min7
+}
+
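+; Signed-min reduction over sixteen i16 lanes: GFX9 reduces in four
+; shuffle/icmp slt/select steps; VI emits the full fifteen-compare scalar
+; chain.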
+define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
+; GFX9-LABEL: @reduction_smin_v16i16(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i16> [[VEC16:%.*]], <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <16 x i16> [[VEC16]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i16> [[VEC16]], <16 x i16> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> [[RDX_SHUF4]]
+; GFX9-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> [[RDX_SHUF7]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x i16> [[RDX_MINMAX_SELECT9]], i32 0
+; GFX9-NEXT: ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_smin_v16i16(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <16 x i16> [[VEC16:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <16 x i16> [[VEC16]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <16 x i16> [[VEC16]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <16 x i16> [[VEC16]], i64 3
+; VI-NEXT: [[ELT4:%.*]] = extractelement <16 x i16> [[VEC16]], i64 4
+; VI-NEXT: [[ELT5:%.*]] = extractelement <16 x i16> [[VEC16]], i64 5
+; VI-NEXT: [[ELT6:%.*]] = extractelement <16 x i16> [[VEC16]], i64 6
+; VI-NEXT: [[ELT7:%.*]] = extractelement <16 x i16> [[VEC16]], i64 7
+; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x i16> [[VEC16]], i64 8
+; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x i16> [[VEC16]], i64 9
+; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x i16> [[VEC16]], i64 10
+; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x i16> [[VEC16]], i64 11
+; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x i16> [[VEC16]], i64 12
+; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x i16> [[VEC16]], i64 13
+; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x i16> [[VEC16]], i64 14
+; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x i16> [[VEC16]], i64 15
+; VI-NEXT: [[CMP0:%.*]] = icmp slt i16 [[ELT1]], [[ELT0]]
+; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP0]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT: [[CMP1:%.*]] = icmp slt i16 [[ELT2]], [[MIN1]]
+; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP1]], i16 [[ELT2]], i16 [[MIN1]]
+; VI-NEXT: [[CMP2:%.*]] = icmp slt i16 [[ELT3]], [[MIN2]]
+; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP2]], i16 [[ELT3]], i16 [[MIN2]]
+; VI-NEXT: [[CMP3:%.*]] = icmp slt i16 [[ELT4]], [[MIN3]]
+; VI-NEXT: [[MIN4:%.*]] = select i1 [[CMP3]], i16 [[ELT4]], i16 [[MIN3]]
+; VI-NEXT: [[CMP4:%.*]] = icmp slt i16 [[ELT5]], [[MIN4]]
+; VI-NEXT: [[MIN5:%.*]] = select i1 [[CMP4]], i16 [[ELT5]], i16 [[MIN4]]
+; VI-NEXT: [[CMP5:%.*]] = icmp slt i16 [[ELT6]], [[MIN5]]
+; VI-NEXT: [[MIN6:%.*]] = select i1 [[CMP5]], i16 [[ELT6]], i16 [[MIN5]]
+; VI-NEXT: [[CMP6:%.*]] = icmp slt i16 [[ELT7]], [[MIN6]]
+; VI-NEXT: [[MIN7:%.*]] = select i1 [[CMP6]], i16 [[ELT7]], i16 [[MIN6]]
+; VI-NEXT: [[CMP7:%.*]] = icmp slt i16 [[ELT8]], [[MIN7]]
+; VI-NEXT: [[MIN8:%.*]] = select i1 [[CMP7]], i16 [[ELT8]], i16 [[MIN7]]
+; VI-NEXT: [[CMP8:%.*]] = icmp slt i16 [[ELT9]], [[MIN8]]
+; VI-NEXT: [[MIN9:%.*]] = select i1 [[CMP8]], i16 [[ELT9]], i16 [[MIN8]]
+; VI-NEXT: [[CMP9:%.*]] = icmp slt i16 [[ELT10]], [[MIN9]]
+; VI-NEXT: [[MIN10:%.*]] = select i1 [[CMP9]], i16 [[ELT10]], i16 [[MIN9]]
+; VI-NEXT: [[CMP10:%.*]] = icmp slt i16 [[ELT11]], [[MIN10]]
+; VI-NEXT: [[MIN11:%.*]] = select i1 [[CMP10]], i16 [[ELT11]], i16 [[MIN10]]
+; VI-NEXT: [[CMP11:%.*]] = icmp slt i16 [[ELT12]], [[MIN11]]
+; VI-NEXT: [[MIN12:%.*]] = select i1 [[CMP11]], i16 [[ELT12]], i16 [[MIN11]]
+; VI-NEXT: [[CMP12:%.*]] = icmp slt i16 [[ELT13]], [[MIN12]]
+; VI-NEXT: [[MIN13:%.*]] = select i1 [[CMP12]], i16 [[ELT13]], i16 [[MIN12]]
+; VI-NEXT: [[CMP13:%.*]] = icmp slt i16 [[ELT14]], [[MIN13]]
+; VI-NEXT: [[MIN14:%.*]] = select i1 [[CMP13]], i16 [[ELT14]], i16 [[MIN13]]
+; VI-NEXT: [[CMP14:%.*]] = icmp slt i16 [[ELT15]], [[MIN14]]
+; VI-NEXT: [[MIN15:%.*]] = select i1 [[CMP14]], i16 [[ELT15]], i16 [[MIN14]]
+; VI-NEXT: ret i16 [[MIN15]]
+;
+entry:
+ %elt0 = extractelement <16 x i16> %vec16, i64 0
+ %elt1 = extractelement <16 x i16> %vec16, i64 1
+ %elt2 = extractelement <16 x i16> %vec16, i64 2
+ %elt3 = extractelement <16 x i16> %vec16, i64 3
+ %elt4 = extractelement <16 x i16> %vec16, i64 4
+ %elt5 = extractelement <16 x i16> %vec16, i64 5
+ %elt6 = extractelement <16 x i16> %vec16, i64 6
+ %elt7 = extractelement <16 x i16> %vec16, i64 7
+
+ %elt8 = extractelement <16 x i16> %vec16, i64 8
+ %elt9 = extractelement <16 x i16> %vec16, i64 9
+ %elt10 = extractelement <16 x i16> %vec16, i64 10
+ %elt11 = extractelement <16 x i16> %vec16, i64 11
+ %elt12 = extractelement <16 x i16> %vec16, i64 12
+ %elt13 = extractelement <16 x i16> %vec16, i64 13
+ %elt14 = extractelement <16 x i16> %vec16, i64 14
+ %elt15 = extractelement <16 x i16> %vec16, i64 15
+
+ %cmp0 = icmp slt i16 %elt1, %elt0
+ %min1 = select i1 %cmp0, i16 %elt1, i16 %elt0
+ %cmp1 = icmp slt i16 %elt2, %min1
+ %min2 = select i1 %cmp1, i16 %elt2, i16 %min1
+ %cmp2 = icmp slt i16 %elt3, %min2
+ %min3 = select i1 %cmp2, i16 %elt3, i16 %min2
+
+ %cmp3 = icmp slt i16 %elt4, %min3
+ %min4 = select i1 %cmp3, i16 %elt4, i16 %min3
+ %cmp4 = icmp slt i16 %elt5, %min4
+ %min5 = select i1 %cmp4, i16 %elt5, i16 %min4
+
+ %cmp5 = icmp slt i16 %elt6, %min5
+ %min6 = select i1 %cmp5, i16 %elt6, i16 %min5
+ %cmp6 = icmp slt i16 %elt7, %min6
+ %min7 = select i1 %cmp6, i16 %elt7, i16 %min6
+
+ %cmp7 = icmp slt i16 %elt8, %min7
+ %min8 = select i1 %cmp7, i16 %elt8, i16 %min7
+ %cmp8 = icmp slt i16 %elt9, %min8
+ %min9 = select i1 %cmp8, i16 %elt9, i16 %min8
+
+ %cmp9 = icmp slt i16 %elt10, %min9
+ %min10 = select i1 %cmp9, i16 %elt10, i16 %min9
+ %cmp10 = icmp slt i16 %elt11, %min10
+ %min11 = select i1 %cmp10, i16 %elt11, i16 %min10
+
+ %cmp11 = icmp slt i16 %elt12, %min11
+ %min12 = select i1 %cmp11, i16 %elt12, i16 %min11
+ %cmp12 = icmp slt i16 %elt13, %min12
+ %min13 = select i1 %cmp12, i16 %elt13, i16 %min12
+
+ %cmp13 = icmp slt i16 %elt14, %min13
+ %min14 = select i1 %cmp13, i16 %elt14, i16 %min13
+ %cmp14 = icmp slt i16 %elt15, %min14
+ %min15 = select i1 %cmp14, i16 %elt15, i16 %min14
+
+ ret i16 %min15
+}
+
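+; Unsigned-max variant of the four-lane i16 reduction (icmp ugt/select).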
+define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
+; GFX9-LABEL: @reduction_umax_v4i16(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i16> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ugt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_umax_v4i16(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
+; VI-NEXT: [[CMP1:%.*]] = icmp ugt i16 [[ELT1]], [[ELT0]]
+; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT: [[CMP2:%.*]] = icmp ugt i16 [[ELT2]], [[MAX1]]
+; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MAX1]]
+; VI-NEXT: [[CMP3:%.*]] = icmp ugt i16 [[ELT3]], [[MAX2]]
+; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MAX2]]
+; VI-NEXT: ret i16 [[MAX3]]
+;
+entry:
+ %elt0 = extractelement <4 x i16> %vec4, i64 0
+ %elt1 = extractelement <4 x i16> %vec4, i64 1
+ %elt2 = extractelement <4 x i16> %vec4, i64 2
+ %elt3 = extractelement <4 x i16> %vec4, i64 3
+
+ %cmp1 = icmp ugt i16 %elt1, %elt0
+ %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0
+ %cmp2 = icmp ugt i16 %elt2, %max1
+ %max2 = select i1 %cmp2, i16 %elt2, i16 %max1
+ %cmp3 = icmp ugt i16 %elt3, %max2
+ %max3 = select i1 %cmp3, i16 %elt3, i16 %max2
+
+ ret i16 %max3
+}
+
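+; Signed-max variant of the four-lane i16 reduction (icmp sgt/select).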
+define i16 @reduction_smax_v4i16(<4 x i16> %vec4) {
+; GFX9-LABEL: @reduction_smax_v4i16(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i16> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: ret i16 [[TMP0]]
+;
+; VI-LABEL: @reduction_smax_v4i16(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[VEC4:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[VEC4]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[VEC4]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[VEC4]], i64 3
+; VI-NEXT: [[CMP1:%.*]] = icmp sgt i16 [[ELT1]], [[ELT0]]
+; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], i16 [[ELT1]], i16 [[ELT0]]
+; VI-NEXT: [[CMP2:%.*]] = icmp sgt i16 [[ELT2]], [[MAX1]]
+; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], i16 [[ELT2]], i16 [[MAX1]]
+; VI-NEXT: [[CMP3:%.*]] = icmp sgt i16 [[ELT3]], [[MAX2]]
+; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], i16 [[ELT3]], i16 [[MAX2]]
+; VI-NEXT: ret i16 [[MAX3]]
+;
+entry:
+ %elt0 = extractelement <4 x i16> %vec4, i64 0
+ %elt1 = extractelement <4 x i16> %vec4, i64 1
+ %elt2 = extractelement <4 x i16> %vec4, i64 2
+ %elt3 = extractelement <4 x i16> %vec4, i64 3
+
+ %cmp1 = icmp sgt i16 %elt1, %elt0
+ %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0
+ %cmp2 = icmp sgt i16 %elt2, %max1
+ %max2 = select i1 %cmp2, i16 %elt2, i16 %max1
+ %cmp3 = icmp sgt i16 %elt3, %max2
+ %max3 = select i1 %cmp3, i16 %elt3, i16 %max2
+
+ ret i16 %max3
+}
+
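+; Fast-math fmax reduction over four half lanes (fcmp fast ogt/select):
+; vectorized on GFX9, scalar on VI.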
+define half @reduction_fmax_v4half(<4 x half> %vec4) {
+; GFX9-LABEL: @reduction_fmax_v4half(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x half> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_fmax_v4half(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
+; VI-NEXT: [[CMP1:%.*]] = fcmp fast ogt half [[ELT1]], [[ELT0]]
+; VI-NEXT: [[MAX1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
+; VI-NEXT: [[CMP2:%.*]] = fcmp fast ogt half [[ELT2]], [[MAX1]]
+; VI-NEXT: [[MAX2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MAX1]]
+; VI-NEXT: [[CMP3:%.*]] = fcmp fast ogt half [[ELT3]], [[MAX2]]
+; VI-NEXT: [[MAX3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MAX2]]
+; VI-NEXT: ret half [[MAX3]]
+;
+entry:
+ %elt0 = extractelement <4 x half> %vec4, i64 0
+ %elt1 = extractelement <4 x half> %vec4, i64 1
+ %elt2 = extractelement <4 x half> %vec4, i64 2
+ %elt3 = extractelement <4 x half> %vec4, i64 3
+
+ %cmp1 = fcmp fast ogt half %elt1, %elt0
+ %max1 = select i1 %cmp1, half %elt1, half %elt0
+ %cmp2 = fcmp fast ogt half %elt2, %max1
+ %max2 = select i1 %cmp2, half %elt2, half %max1
+ %cmp3 = fcmp fast ogt half %elt3, %max2
+ %max3 = select i1 %cmp3, half %elt3, half %max2
+
+ ret half %max3
+}
+
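+; Fast-math fmin counterpart of the test above (fcmp fast olt/select).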
+define half @reduction_fmin_v4half(<4 x half> %vec4) {
+; GFX9-LABEL: @reduction_fmin_v4half(
+; GFX9-NEXT: entry:
+; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[VEC4:%.*]], <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x half> [[VEC4]], [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x half> [[VEC4]], <4 x half> [[RDX_SHUF]]
+; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x half> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x half> [[RDX_MINMAX_SELECT]], <4 x half> [[RDX_SHUF1]]
+; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: ret half [[TMP0]]
+;
+; VI-LABEL: @reduction_fmin_v4half(
+; VI-NEXT: entry:
+; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[VEC4:%.*]], i64 0
+; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[VEC4]], i64 1
+; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[VEC4]], i64 2
+; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[VEC4]], i64 3
+; VI-NEXT: [[CMP1:%.*]] = fcmp fast olt half [[ELT1]], [[ELT0]]
+; VI-NEXT: [[MIN1:%.*]] = select i1 [[CMP1]], half [[ELT1]], half [[ELT0]]
+; VI-NEXT: [[CMP2:%.*]] = fcmp fast olt half [[ELT2]], [[MIN1]]
+; VI-NEXT: [[MIN2:%.*]] = select i1 [[CMP2]], half [[ELT2]], half [[MIN1]]
+; VI-NEXT: [[CMP3:%.*]] = fcmp fast olt half [[ELT3]], [[MIN2]]
+; VI-NEXT: [[MIN3:%.*]] = select i1 [[CMP3]], half [[ELT3]], half [[MIN2]]
+; VI-NEXT: ret half [[MIN3]]
+;
+entry:
+ %elt0 = extractelement <4 x half> %vec4, i64 0
+ %elt1 = extractelement <4 x half> %vec4, i64 1
+ %elt2 = extractelement <4 x half> %vec4, i64 2
+ %elt3 = extractelement <4 x half> %vec4, i64 3
+
+ %cmp1 = fcmp fast olt half %elt1, %elt0
+ %min1 = select i1 %cmp1, half %elt1, half %elt0
+ %cmp2 = fcmp fast olt half %elt2, %min1
+ %min2 = select i1 %cmp2, half %elt2, half %min1
+ %cmp3 = fcmp fast olt half %elt3, %min2
+ %min3 = select i1 %cmp3, half %elt3, half %min2
+
+ ret half %min3
+}
+
+; Tests to make sure the reduction does not kick in. Vega does not support packed math for types larger than 16 bits.
+define float @reduction_v4float(<4 x float> %a) {
+; GCN-LABEL: @reduction_v4float(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[A:%.*]], i64 0
+; GCN-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; GCN-NEXT: [[ELT2:%.*]] = extractelement <4 x float> [[A]], i64 2
+; GCN-NEXT: [[ELT3:%.*]] = extractelement <4 x float> [[A]], i64 3
+; GCN-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; GCN-NEXT: [[ADD2:%.*]] = fadd fast float [[ELT2]], [[ADD1]]
+; GCN-NEXT: [[ADD3:%.*]] = fadd fast float [[ELT3]], [[ADD2]]
+; GCN-NEXT: ret float [[ADD3]]
+;
+entry:
+ %elt0 = extractelement <4 x float> %a, i64 0
+ %elt1 = extractelement <4 x float> %a, i64 1
+ %elt2 = extractelement <4 x float> %a, i64 2
+ %elt3 = extractelement <4 x float> %a, i64 3
+
+ %add1 = fadd fast float %elt1, %elt0
+ %add2 = fadd fast float %elt2, %add1
+ %add3 = fadd fast float %elt3, %add2
+
+ ret float %add3
+}
\ No newline at end of file
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll
new file mode 100644
index 00000000000..5998801705b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/extract-insert.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=thumb7 -mcpu=swift | FileCheck %s
+
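+; PR13837: the four scalar fptosi conversions are vectorized into a single
+; <4 x float> -> <4 x i32> fptosi; the insertelement chain is then rebuilt
+; from extracts of the vector result.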
+define <4 x i32> @PR13837(<4 x float> %in) {
+; CHECK-LABEL: @PR13837(
+; CHECK-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[IN:%.*]] to <4 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[V3]]
+;
+ %t0 = extractelement <4 x float> %in, i64 0
+ %t1 = extractelement <4 x float> %in, i64 1
+ %t2 = extractelement <4 x float> %in, i64 2
+ %t3 = extractelement <4 x float> %in, i64 3
+ %c0 = fptosi float %t0 to i32
+ %c1 = fptosi float %t1 to i32
+ %c2 = fptosi float %t2 to i32
+ %c3 = fptosi float %t3 to i32
+ %v0 = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %v1 = insertelement <4 x i32> %v0, i32 %c1, i32 1
+ %v2 = insertelement <4 x i32> %v1, i32 %c2, i32 2
+ %v3 = insertelement <4 x i32> %v2, i32 %c3, i32 3
+ ret <4 x i32> %v3
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/ARM/lit.local.cfg
new file mode 100644
index 00000000000..236e1d34416
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'ARM' not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/memory.ll b/llvm/test/Transforms/SLPVectorizer/ARM/memory.ll
new file mode 100644
index 00000000000..70e87033207
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/memory.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+; On Swift, unaligned <2 x double> stores need 4 uops, so it is cheaper to do
+; this with scalar stores.
+
+define void @expensive_double_store(double* noalias %dst, double* noalias %src, i64 %count) {
+; CHECK-LABEL: @expensive_double_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[SRC:%.*]], align 8
+; CHECK-NEXT: store double [[TMP0]], double* [[DST:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST]], i64 1
+; CHECK-NEXT: store double [[TMP1]], double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load double, double* %src, align 8
+ store double %0, double* %dst, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %src, i64 1
+ %1 = load double, double* %arrayidx2, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %dst, i64 1
+ store double %1, double* %arrayidx3, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/sroa.ll b/llvm/test/Transforms/SLPVectorizer/ARM/sroa.ll
new file mode 100644
index 00000000000..c43e8f10a59
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/sroa.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mcpu=swift -mtriple=thumbv7-apple-ios -basicaa -slp-vectorizer < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+%class.Complex = type { double, double }
+
+; Code like this is the result of SROA. Make sure we don't vectorize this:
+; the scalar versions of the shl/or are handled by the backend and disappear,
+; while the vectorized code would stay.
+
+define void @SROAed(%class.Complex* noalias nocapture sret %agg.result, [4 x i32] %a.coerce, [4 x i32] %b.coerce) {
+; CHECK-LABEL: @SROAed(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE:%.*]], 0
+; CHECK-NEXT: [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_0_EXTRACT]] to i64
+; CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 1
+; CHECK-NEXT: [[A_SROA_0_4_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_1_EXTRACT]] to i64
+; CHECK-NEXT: [[A_SROA_0_4_INSERT_SHIFT:%.*]] = shl nuw i64 [[A_SROA_0_4_INSERT_EXT]], 32
+; CHECK-NEXT: [[A_SROA_0_4_INSERT_INSERT:%.*]] = or i64 [[A_SROA_0_4_INSERT_SHIFT]], [[A_SROA_0_0_INSERT_EXT]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_SROA_0_4_INSERT_INSERT]] to double
+; CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 2
+; CHECK-NEXT: [[A_SROA_3_8_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_2_EXTRACT]] to i64
+; CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 3
+; CHECK-NEXT: [[A_SROA_3_12_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_3_EXTRACT]] to i64
+; CHECK-NEXT: [[A_SROA_3_12_INSERT_SHIFT:%.*]] = shl nuw i64 [[A_SROA_3_12_INSERT_EXT]], 32
+; CHECK-NEXT: [[A_SROA_3_12_INSERT_INSERT:%.*]] = or i64 [[A_SROA_3_12_INSERT_SHIFT]], [[A_SROA_3_8_INSERT_EXT]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[A_SROA_3_12_INSERT_INSERT]] to double
+; CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE:%.*]], 0
+; CHECK-NEXT: [[B_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_0_EXTRACT]] to i64
+; CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 1
+; CHECK-NEXT: [[B_SROA_0_4_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_1_EXTRACT]] to i64
+; CHECK-NEXT: [[B_SROA_0_4_INSERT_SHIFT:%.*]] = shl nuw i64 [[B_SROA_0_4_INSERT_EXT]], 32
+; CHECK-NEXT: [[B_SROA_0_4_INSERT_INSERT:%.*]] = or i64 [[B_SROA_0_4_INSERT_SHIFT]], [[B_SROA_0_0_INSERT_EXT]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_SROA_0_4_INSERT_INSERT]] to double
+; CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 2
+; CHECK-NEXT: [[B_SROA_3_8_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_2_EXTRACT]] to i64
+; CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 3
+; CHECK-NEXT: [[B_SROA_3_12_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_3_EXTRACT]] to i64
+; CHECK-NEXT: [[B_SROA_3_12_INSERT_SHIFT:%.*]] = shl nuw i64 [[B_SROA_3_12_INSERT_EXT]], 32
+; CHECK-NEXT: [[B_SROA_3_12_INSERT_INSERT:%.*]] = or i64 [[B_SROA_3_12_INSERT_SHIFT]], [[B_SROA_3_8_INSERT_EXT]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_SROA_3_12_INSERT_INSERT]] to double
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP0]], [[TMP2]]
+; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[RE_I_I:%.*]] = getelementptr inbounds [[CLASS_COMPLEX:%.*]], %class.Complex* [[AGG_RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT: store double [[ADD]], double* [[RE_I_I]], align 4
+; CHECK-NEXT: [[IM_I_I:%.*]] = getelementptr inbounds [[CLASS_COMPLEX]], %class.Complex* [[AGG_RESULT]], i32 0, i32 1
+; CHECK-NEXT: store double [[ADD3]], double* [[IM_I_I]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %a.coerce.fca.0.extract = extractvalue [4 x i32] %a.coerce, 0
+ %a.sroa.0.0.insert.ext = zext i32 %a.coerce.fca.0.extract to i64
+ %a.coerce.fca.1.extract = extractvalue [4 x i32] %a.coerce, 1
+ %a.sroa.0.4.insert.ext = zext i32 %a.coerce.fca.1.extract to i64
+ %a.sroa.0.4.insert.shift = shl nuw i64 %a.sroa.0.4.insert.ext, 32
+ %a.sroa.0.4.insert.insert = or i64 %a.sroa.0.4.insert.shift, %a.sroa.0.0.insert.ext
+ %0 = bitcast i64 %a.sroa.0.4.insert.insert to double
+ %a.coerce.fca.2.extract = extractvalue [4 x i32] %a.coerce, 2
+ %a.sroa.3.8.insert.ext = zext i32 %a.coerce.fca.2.extract to i64
+ %a.coerce.fca.3.extract = extractvalue [4 x i32] %a.coerce, 3
+ %a.sroa.3.12.insert.ext = zext i32 %a.coerce.fca.3.extract to i64
+ %a.sroa.3.12.insert.shift = shl nuw i64 %a.sroa.3.12.insert.ext, 32
+ %a.sroa.3.12.insert.insert = or i64 %a.sroa.3.12.insert.shift, %a.sroa.3.8.insert.ext
+ %1 = bitcast i64 %a.sroa.3.12.insert.insert to double
+ %b.coerce.fca.0.extract = extractvalue [4 x i32] %b.coerce, 0
+ %b.sroa.0.0.insert.ext = zext i32 %b.coerce.fca.0.extract to i64
+ %b.coerce.fca.1.extract = extractvalue [4 x i32] %b.coerce, 1
+ %b.sroa.0.4.insert.ext = zext i32 %b.coerce.fca.1.extract to i64
+ %b.sroa.0.4.insert.shift = shl nuw i64 %b.sroa.0.4.insert.ext, 32
+ %b.sroa.0.4.insert.insert = or i64 %b.sroa.0.4.insert.shift, %b.sroa.0.0.insert.ext
+ %2 = bitcast i64 %b.sroa.0.4.insert.insert to double
+ %b.coerce.fca.2.extract = extractvalue [4 x i32] %b.coerce, 2
+ %b.sroa.3.8.insert.ext = zext i32 %b.coerce.fca.2.extract to i64
+ %b.coerce.fca.3.extract = extractvalue [4 x i32] %b.coerce, 3
+ %b.sroa.3.12.insert.ext = zext i32 %b.coerce.fca.3.extract to i64
+ %b.sroa.3.12.insert.shift = shl nuw i64 %b.sroa.3.12.insert.ext, 32
+ %b.sroa.3.12.insert.insert = or i64 %b.sroa.3.12.insert.shift, %b.sroa.3.8.insert.ext
+ %3 = bitcast i64 %b.sroa.3.12.insert.insert to double
+ %add = fadd double %0, %2
+ %add3 = fadd double %1, %3
+ %re.i.i = getelementptr inbounds %class.Complex, %class.Complex* %agg.result, i32 0, i32 0
+ store double %add, double* %re.i.i, align 4
+ %im.i.i = getelementptr inbounds %class.Complex, %class.Complex* %agg.result, i32 0, i32 1
+ store double %add3, double* %im.i.i, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
new file mode 100644
index 00000000000..2cb98eb371b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'NVPTX' not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll
new file mode 100644
index 00000000000..3eae2d0b010
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/non-vectorizable-intrinsic.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -o - -S -slp-threshold=-1000 | FileCheck %s
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx--nvidiacl"
+
+; CTLZ cannot currently be vectorized because the second argument is a scalar
+; for both the scalar and vector forms of the intrinsic. In the future it
+; should be possible to vectorize such functions.
+; This test causes an assert if LLVM tries to vectorize CTLZ.
+
+define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
+; CHECK-LABEL: @cltz_test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i8> undef, i8 [[CALL_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT: [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
+; CHECK-NEXT: ret <2 x i8> [[VECINIT2]]
+;
+entry:
+ %0 = extractelement <2 x i8> %x, i32 0
+ %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+ %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
+ %1 = extractelement <2 x i8> %x, i32 1
+ %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+ %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+ ret <2 x i8> %vecinit2
+}
+
+define <2 x i8> @cltz_test2(<2 x i8> %x) #1 {
+; CHECK-LABEL: @cltz_test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT: [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT: [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i8> undef, i8 [[CALL_I]], i32 0
+; CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
+; CHECK-NEXT: ret <2 x i8> [[VECINIT2]]
+;
+entry:
+ %0 = extractelement <2 x i8> %x, i32 0
+ %1 = extractelement <2 x i8> %x, i32 1
+ %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
+ %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
+ %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
+ %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
+ ret <2 x i8> %vecinit2
+}
+
+declare i8 @llvm.ctlz.i8(i8, i1) #3
+
+attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
new file mode 100644
index 00000000000..6c474705abc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_40 | FileCheck %s -check-prefix=NOVECTOR
+
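+; With sm_70 (native v2f16 math) the two adjacent half loads, fmuls, fadds
+; and stores are combined into <2 x half> operations; with sm_40 everything
+; stays scalar (NOVECTOR).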
+define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %arg, i8* noalias nocapture readonly align 256 dereferenceable(19267584) %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 {
+; CHECK-LABEL: @fusion(
+; CHECK-NEXT: [[TMP:%.*]] = shl nuw nsw i32 [[ARG2:%.*]], 6
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP]], [[ARG3:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half*
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x half> [[TMP2]], <half 0xH5380, half 0xH5380>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x half> [[TMP3]], <half 0xH57F0, half 0xH57F0>
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>*
+; CHECK-NEXT: store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8
+; CHECK-NEXT: ret void
+;
+; NOVECTOR-LABEL: @fusion(
+; NOVECTOR-NEXT: [[TMP:%.*]] = shl nuw nsw i32 [[ARG2:%.*]], 6
+; NOVECTOR-NEXT: [[TMP4:%.*]] = or i32 [[TMP]], [[ARG3:%.*]]
+; NOVECTOR-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2
+; NOVECTOR-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; NOVECTOR-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], 1
+; NOVECTOR-NEXT: [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half*
+; NOVECTOR-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]]
+; NOVECTOR-NEXT: [[TMP12:%.*]] = load half, half* [[TMP11]], align 8
+; NOVECTOR-NEXT: [[TMP13:%.*]] = fmul fast half [[TMP12]], 0xH5380
+; NOVECTOR-NEXT: [[TMP14:%.*]] = fadd fast half [[TMP13]], 0xH57F0
+; NOVECTOR-NEXT: [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half*
+; NOVECTOR-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]]
+; NOVECTOR-NEXT: store half [[TMP14]], half* [[TMP16]], align 8
+; NOVECTOR-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
+; NOVECTOR-NEXT: [[TMP18:%.*]] = load half, half* [[TMP17]], align 2
+; NOVECTOR-NEXT: [[TMP19:%.*]] = fmul fast half [[TMP18]], 0xH5380
+; NOVECTOR-NEXT: [[TMP20:%.*]] = fadd fast half [[TMP19]], 0xH57F0
+; NOVECTOR-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
+; NOVECTOR-NEXT: store half [[TMP20]], half* [[TMP21]], align 2
+; NOVECTOR-NEXT: ret void
+;
+ %tmp = shl nuw nsw i32 %arg2, 6
+ %tmp4 = or i32 %tmp, %arg3
+ %tmp5 = shl nuw nsw i32 %tmp4, 2
+ %tmp6 = zext i32 %tmp5 to i64
+ %tmp7 = or i64 %tmp6, 1
+ %tmp10 = bitcast i8* %arg1 to half*
+ %tmp11 = getelementptr inbounds half, half* %tmp10, i64 %tmp6
+ %tmp12 = load half, half* %tmp11, align 8
+ %tmp13 = fmul fast half %tmp12, 0xH5380
+ %tmp14 = fadd fast half %tmp13, 0xH57F0
+ %tmp15 = bitcast i8* %arg to half*
+ %tmp16 = getelementptr inbounds half, half* %tmp15, i64 %tmp6
+ store half %tmp14, half* %tmp16, align 8
+ %tmp17 = getelementptr inbounds half, half* %tmp10, i64 %tmp7
+ %tmp18 = load half, half* %tmp17, align 2
+ %tmp19 = fmul fast half %tmp18, 0xH5380
+ %tmp20 = fadd fast half %tmp19, 0xH57F0
+ %tmp21 = getelementptr inbounds half, half* %tmp15, i64 %tmp7
+ store half %tmp20, half* %tmp21, align 2
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/PowerPC/aggregate.ll b/llvm/test/Transforms/SLPVectorizer/PowerPC/aggregate.ll
new file mode 100644
index 00000000000..99af834e5b8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/PowerPC/aggregate.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s
+
+%struct.S = type { i8*, i8* }
+
+@kS0 = common global %struct.S zeroinitializer, align 8
+
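+; The two i64 loads feed an aggregate return via insertvalue; the checks
+; verify that they are left as scalar loads.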
+define { i64, i64 } @getS() {
+; CHECK-LABEL: @getS(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP0]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i64, i64 } [[TMP2]], i64 [[TMP1]], 1
+; CHECK-NEXT: ret { i64, i64 } [[TMP3]]
+;
+entry:
+ %0 = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
+ %1 = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
+ %2 = insertvalue { i64, i64 } undef, i64 %0, 0
+ %3 = insertvalue { i64, i64 } %2, i64 %1, 1
+ ret { i64, i64 } %3
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/PowerPC/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/PowerPC/lit.local.cfg
new file mode 100644
index 00000000000..091332439b1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'PowerPC' not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll b/llvm/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll
new file mode 100644
index 00000000000..7f7df82a38f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s
+
+%struct.A = type { i8*, i8* }
+
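+; PR27897: the two i64 loads are used both in an integer subtract and, via
+; inttoptr, in a pointer compare; the checks verify that they stay scalar.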
+define i64 @foo(%struct.A* nocapture readonly %this) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[END_I:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], %struct.A* [[THIS:%.*]], i64 0, i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8** [[END_I]] to i64*
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.A* [[THIS]] to i64*
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8
+; CHECK-NEXT: [[SUB_PTR_SUB_I:%.*]] = sub i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[SUB_PTR_SUB_I]], 9
+; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]]
+; CHECK: lor.lhs.false:
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to i8*
+; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP1]] to i8*
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i8* [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP2]], i64 2, i64 -1
+; CHECK-NEXT: ret i64 [[DOT]]
+; CHECK: return:
+; CHECK-NEXT: ret i64 2
+;
+entry:
+ %end.i = getelementptr inbounds %struct.A, %struct.A* %this, i64 0, i32 1
+ %0 = bitcast i8** %end.i to i64*
+ %1 = load i64, i64* %0, align 8
+ %2 = bitcast %struct.A* %this to i64*
+ %3 = load i64, i64* %2, align 8
+ %sub.ptr.sub.i = sub i64 %1, %3
+ %cmp = icmp sgt i64 %sub.ptr.sub.i, 9
+ br i1 %cmp, label %return, label %lor.lhs.false
+
+lor.lhs.false:
+ %4 = inttoptr i64 %3 to i8*
+ %5 = inttoptr i64 %1 to i8*
+ %cmp2 = icmp ugt i8* %5, %4
+ %. = select i1 %cmp2, i64 2, i64 -1
+ ret i64 %.
+
+return:
+ ret i64 2
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll b/llvm/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll
new file mode 100644
index 00000000000..203f47c76d4
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/PowerPC/short-to-double.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P9
+; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P8
+
+%struct._pp = type { i16, i16, i16, i16 }
+
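+; The two adjacent i16 loads from %struct._pp are combined into a <2 x i16>
+; load on POWER8 but not on POWER9; see the CHECK lines at the end.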
+; Function Attrs: norecurse nounwind readonly
+define [5 x double] @foo(double %k, i64 %n, %struct._pp* nocapture readonly %p) local_unnamed_addr #0 {
+entry:
+ %cmp17 = icmp sgt i64 %n, 0
+ br i1 %cmp17, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %retval.sroa.0.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %retval.sroa.4.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add10, %for.body ]
+ %.fca.0.insert = insertvalue [5 x double] undef, double %retval.sroa.0.0.lcssa, 0
+ %.fca.1.insert = insertvalue [5 x double] %.fca.0.insert, double %retval.sroa.4.0.lcssa, 1
+ ret [5 x double] %.fca.1.insert
+
+for.body: ; preds = %entry, %for.body
+ %i.020 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %retval.sroa.4.019 = phi double [ %add10, %for.body ], [ 0.000000e+00, %entry ]
+ %retval.sroa.0.018 = phi double [ %add, %for.body ], [ 0.000000e+00, %entry ]
+ %r1 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 2
+ %0 = load i16, i16* %r1, align 2
+ %conv2 = uitofp i16 %0 to double
+ %mul = fmul double %conv2, %k
+ %add = fadd double %retval.sroa.0.018, %mul
+ %g5 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 1
+ %1 = load i16, i16* %g5, align 2
+ %conv7 = uitofp i16 %1 to double
+ %mul8 = fmul double %conv7, %k
+ %add10 = fadd double %retval.sroa.4.019, %mul8
+ %inc = add nuw nsw i64 %i.020, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-P8: load <2 x i16>
+; CHECK-P9-NOT: load <2 x i16>
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
new file mode 100644
index 00000000000..1a32f659066
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
@@ -0,0 +1,36 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -debug-only=SLP \
+; RUN: -S -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that the SLP vectorizer gets the right cost difference for a compare
+; node.
+
+; Function Attrs: norecurse nounwind readonly
+define void @fun(i8* nocapture, i32 zeroext) local_unnamed_addr #0 {
+.lr.ph.preheader:
+ br label %.lr.ph
+
+.lr.ph: ; preds = %.lr.ph.preheader, %.lr.ph
+ %2 = phi i32 [ %., %.lr.ph ], [ undef, %.lr.ph.preheader ]
+ %3 = phi i32 [ %.9, %.lr.ph ], [ undef, %.lr.ph.preheader ]
+ %4 = icmp ult i32 %2, %1
+ %5 = select i1 %4, i32 0, i32 %1
+ %. = sub i32 %2, %5
+ %6 = icmp ult i32 %3, %1
+ %7 = select i1 %6, i32 0, i32 %1
+ %.9 = sub i32 %3, %7
+ %8 = zext i32 %. to i64
+ %9 = getelementptr inbounds i8, i8* %0, i64 %8
+ %10 = load i8, i8* %9, align 1
+ %11 = zext i32 %.9 to i64
+ %12 = getelementptr inbounds i8, i8* %0, i64 %11
+ %13 = load i8, i8* %12, align 1
+ %14 = icmp eq i8 %10, %13
+ br i1 %14, label %.lr.ph, label %._crit_edge
+
+._crit_edge: ; preds = %.lr.ph
+ ret void
+
+; CHECK: SLP: Adding cost -1 for bundle that starts with %4 = icmp ult i32 %2, %1.
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/SystemZ/lit.local.cfg
new file mode 100644
index 00000000000..5c02dd3614a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/lit.local.cfg
@@ -0,0 +1,3 @@
+if 'SystemZ' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
new file mode 100644
index 00000000000..3855834d8a1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -S < %s | FileCheck %s
+
+@bar = external global [4 x [4 x i32]], align 4
+@dct_luma = external global [4 x [4 x i32]], align 4
+
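+; PR34619: the four add/ashr/store chains into @dct_luma become a single
+; <4 x i32> add, ashr and store; %add277 is stored as a scalar and also fed
+; into the vector via insertelement.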
+define void @foo() local_unnamed_addr {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ADD277:%.*]] = add nsw i32 undef, undef
+; CHECK-NEXT: store i32 [[ADD277]], i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
+; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
+; CHECK-NEXT: [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
+; CHECK-NEXT: [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[ADD277]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> undef, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = ashr <4 x i32> [[TMP7]], <i32 6, i32 6, i32 6, i32 6>
+; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT: unreachable
+;
+entry:
+ %add277 = add nsw i32 undef, undef
+ store i32 %add277, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 1), align 4
+ %0 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 0), align 4
+ %sub355 = add nsw i32 undef, %0
+ %shr.i = ashr i32 %sub355, 6
+ %arrayidx372 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 0
+ store i32 %shr.i, i32* %arrayidx372, align 4
+ %sub355.1 = add nsw i32 undef, %add277
+ %shr.i.1 = ashr i32 %sub355.1, 6
+ %arrayidx372.1 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1
+ store i32 %shr.i.1, i32* %arrayidx372.1, align 4
+ %1 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2), align 4
+ %sub355.2 = add nsw i32 undef, %1
+ %shr.i.2 = ashr i32 %sub355.2, 6
+ %arrayidx372.2 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2
+ store i32 %shr.i.2, i32* %arrayidx372.2, align 4
+ %2 = load i32, i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 3), align 4
+ %sub355.3 = add nsw i32 undef, %2
+ %shr.i.3 = ashr i32 %sub355.3, 6
+ %arrayidx372.3 = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3
+ store i32 %shr.i.3, i32* %arrayidx372.3, align 4
+ unreachable
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
new file mode 100644
index 00000000000..9b5d412065f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+
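+; Two loaded i64 values stored twice each in <a, b, a, b> order become a
+; <2 x i64> load, a <4 x i64> broadcast shuffle and one wide store.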
+define void @i64_simplified(i64* noalias %st, i64* noalias %ld) {
+; CHECK-LABEL: @i64_simplified(
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
+; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT: ret void
+;
+ %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
+
+ %t0 = load i64, i64* %ld, align 8
+ %t1 = load i64, i64* %arrayidx1, align 8
+
+ %arrayidx3 = getelementptr inbounds i64, i64* %st, i64 1
+ %arrayidx4 = getelementptr inbounds i64, i64* %st, i64 2
+ %arrayidx5 = getelementptr inbounds i64, i64* %st, i64 3
+
+ store i64 %t0, i64* %st, align 8
+ store i64 %t1, i64* %arrayidx3, align 8
+ store i64 %t0, i64* %arrayidx4, align 8
+ store i64 %t1, i64* %arrayidx5, align 8
+ ret void
+}
+
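+; Same as above with the stores in <b, a, b, a> order: an extra <2 x i64>
+; lane-swap shuffle is emitted before the broadcast shuffle.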
+define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {
+; CHECK-LABEL: @i64_simplifiedi_reversed(
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
+; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
+
+ %t0 = load i64, i64* %ld, align 8
+ %t1 = load i64, i64* %arrayidx1, align 8
+
+ %arrayidx3 = getelementptr inbounds i64, i64* %st, i64 1
+ %arrayidx4 = getelementptr inbounds i64, i64* %st, i64 2
+ %arrayidx5 = getelementptr inbounds i64, i64* %st, i64 3
+
+ store i64 %t1, i64* %st, align 8
+ store i64 %t0, i64* %arrayidx3, align 8
+ store i64 %t1, i64* %arrayidx4, align 8
+ store i64 %t0, i64* %arrayidx5, align 8
+ ret void
+}
+
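+; Here lane 0 is stored three times and lane 1 once before a final scalar
+; store to %ld, which reuses an extractelement of the shuffled vector.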
+define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) {
+; CHECK-LABEL: @i64_simplifiedi_extract(
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
+; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3
+; CHECK-NEXT: store i64 [[TMP4]], i64* [[LD]], align 8
+; CHECK-NEXT: ret void
+;
+ %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
+
+ %t0 = load i64, i64* %ld, align 8
+ %t1 = load i64, i64* %arrayidx1, align 8
+
+ %arrayidx3 = getelementptr inbounds i64, i64* %st, i64 1
+ %arrayidx4 = getelementptr inbounds i64, i64* %st, i64 2
+ %arrayidx5 = getelementptr inbounds i64, i64* %st, i64 3
+
+ store i64 %t0, i64* %st, align 8
+ store i64 %t0, i64* %arrayidx3, align 8
+ store i64 %t0, i64* %arrayidx4, align 8
+ store i64 %t1, i64* %arrayidx5, align 8
+ store i64 %t1, i64* %ld, align 8
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll
new file mode 100644
index 00000000000..e36d6f3e454
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S -mcpu=corei7 | FileCheck %s
+
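+; Index-of-minimum scan over a constant <8 x i32> kept in an alloca: the
+; constant store stays a vector, while the icmp/select chain must remain scalar.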
+define i32 @main() {
+; CHECK-LABEL: @main(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP:%.*]] = alloca <8 x i32>, align 32
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32>* [[TMP]] to [8 x i32]*
+; CHECK-NEXT: [[TMP2:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32>* [[TMP]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i32>, <8 x i32>* [[TMP]], i64 0, i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 2
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 3
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 4
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 6
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 5
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[TMP1]], i64 0, i64 7
+; CHECK-NEXT: store <8 x i32> <i32 -221320154, i32 -756426931, i32 563883532, i32 382683935, i32 144890241, i32 -1052877364, i32 -1052877364, i32 -1016007675>, <8 x i32>* [[TMP]], align 32
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP2]] to i8*
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP4]], align 32
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP5]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = icmp slt i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = zext i1 [[TMP15]] to i32
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP6]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[TMP18]], [[TMP16]]
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 [[TMP16]]
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP19]], i32 2, i32 [[TMP16]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = icmp slt i32 [[TMP22]], [[TMP20]]
+; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP20]]
+; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP23]], i32 3, i32 [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP8]], align 16
+; CHECK-NEXT: [[TMP27:%.*]] = icmp slt i32 [[TMP26]], [[TMP24]]
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 [[TMP24]]
+; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 4, i32 [[TMP25]]
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP31:%.*]] = icmp slt i32 [[TMP30]], [[TMP28]]
+; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 5, i32 [[TMP29]]
+; CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[TMP9]], align 8
+; CHECK-NEXT: [[TMP35:%.*]] = icmp slt i32 [[TMP34]], [[TMP32]]
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 [[TMP32]]
+; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP35]], i32 6, i32 [[TMP33]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP39:%.*]] = icmp slt i32 [[TMP38]], [[TMP36]]
+; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP39]], i32 7, i32 [[TMP37]]
+; CHECK-NEXT: store i32 [[TMP40]], i32* [[TMP2]], align 4
+; CHECK-NEXT: ret i32 0
+;
+bb:
+ %tmp = alloca <8 x i32>, align 32
+ %tmp1 = bitcast <8 x i32>* %tmp to [8 x i32]*
+ %tmp2 = alloca i32, align 4
+ %tmp3 = bitcast <8 x i32>* %tmp to i8*
+ %tmp4 = getelementptr inbounds <8 x i32>, <8 x i32>* %tmp, i64 0, i64 0
+ %tmp5 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 1
+ %tmp6 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 2
+ %tmp7 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 3
+ %tmp8 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 4
+ %tmp9 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 6
+ %tmp10 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 5
+ %tmp11 = getelementptr inbounds [8 x i32], [8 x i32]* %tmp1, i64 0, i64 7
+ store <8 x i32> <i32 -221320154, i32 -756426931, i32 563883532, i32 382683935, i32 144890241, i32 -1052877364, i32 -1052877364, i32 -1016007675>, <8 x i32>* %tmp, align 32
+ %tmp12 = bitcast i32* %tmp2 to i8*
+ %tmp13 = load i32, i32* %tmp4, align 32
+ %tmp14 = load i32, i32* %tmp5, align 4
+ %tmp15 = icmp slt i32 %tmp14, %tmp13
+ %tmp16 = select i1 %tmp15, i32 %tmp14, i32 %tmp13
+ %tmp17 = zext i1 %tmp15 to i32
+ %tmp18 = load i32, i32* %tmp6, align 8
+ %tmp19 = icmp slt i32 %tmp18, %tmp16
+ %tmp20 = select i1 %tmp19, i32 %tmp18, i32 %tmp16
+ %tmp21 = select i1 %tmp19, i32 2, i32 %tmp16
+ %tmp22 = load i32, i32* %tmp7, align 4
+ %tmp23 = icmp slt i32 %tmp22, %tmp20
+ %tmp24 = select i1 %tmp23, i32 %tmp22, i32 %tmp20
+ %tmp25 = select i1 %tmp23, i32 3, i32 %tmp21
+ %tmp26 = load i32, i32* %tmp8, align 16
+ %tmp27 = icmp slt i32 %tmp26, %tmp24
+ %tmp28 = select i1 %tmp27, i32 %tmp26, i32 %tmp24
+ %tmp29 = select i1 %tmp27, i32 4, i32 %tmp25
+ %tmp30 = load i32, i32* %tmp10, align 4
+ %tmp31 = icmp slt i32 %tmp30, %tmp28
+ %tmp32 = select i1 %tmp31, i32 %tmp30, i32 %tmp28
+ %tmp33 = select i1 %tmp31, i32 5, i32 %tmp29
+ %tmp34 = load i32, i32* %tmp9, align 8
+ %tmp35 = icmp slt i32 %tmp34, %tmp32
+ %tmp36 = select i1 %tmp35, i32 %tmp34, i32 %tmp32
+ %tmp37 = select i1 %tmp35, i32 6, i32 %tmp33
+ %tmp38 = load i32, i32* %tmp11, align 4
+ %tmp39 = icmp slt i32 %tmp38, %tmp36
+ %tmp40 = select i1 %tmp39, i32 7, i32 %tmp37
+ store i32 %tmp40, i32* %tmp2, align 4
+ ret i32 0
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
new file mode 100644
index 00000000000..625748f6820
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
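+; Each of four consecutive loads is accumulated together with its square;
+; expected to become a <4 x i32> load and multiply plus a horizontal add
+; reduction feeding the loop-carried phi.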
+define void @mainTest(i32* %ptr) #0 {
+; CHECK-LABEL: @mainTest(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA5:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 1, undef
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], undef
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], undef
+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP6]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], [[TMP5]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP16]], 1
+; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]]
+; CHECK-NEXT: [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], [[TMP6]]
+; CHECK-NEXT: [[OP_EXTRA5]] = add i32 [[OP_EXTRA4]], [[TMP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP15]], undef
+; CHECK-NEXT: br label [[LOOP]]
+; CHECK: bail_out:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp = icmp eq i32* %ptr, null
+ br i1 %cmp, label %loop, label %bail_out
+
+loop:
+ %dummy_phi = phi i32 [ 1, %entry ], [ %18, %loop ]
+ %0 = load i32, i32* %ptr, align 4
+ %1 = mul i32 %0, %0
+ %2 = add i32 1, %1
+ %3 = getelementptr inbounds i32, i32* %ptr, i64 1
+ %4 = load i32, i32* %3, align 4
+ %5 = mul i32 %4, %4
+ %6 = add i32 %2, %4
+ %7 = add i32 %6, %5
+ %8 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %9 = load i32, i32* %8, align 4
+ %10 = mul i32 %9, %9
+ %11 = add i32 %7, %9
+ %12 = add i32 %11, %10
+ %13 = sext i32 %9 to i64
+ %14 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %15 = load i32, i32* %14, align 4
+ %16 = mul i32 %15, %15
+ %17 = add i32 %12, %15
+ %18 = add i32 %17, %16
+ br label %loop
+
+bail_out:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="westmere" }
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
new file mode 100644
index 00000000000..712ff040a91
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
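+; No memory access involved: four adds of a loop-carried value are gathered
+; with insertelement, incremented, shifted right, and combined with a
+; horizontal add reduction.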
+define void @test() #0 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP6:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP4]], <i64 3, i64 2, i64 1, i64 0>
+; CHECK-NEXT: [[TMP6]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = ashr exact <4 x i64> [[TMP8]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT: [[SUM1:%.*]] = add i64 undef, undef
+; CHECK-NEXT: [[SUM2:%.*]] = add i64 [[SUM1]], undef
+; CHECK-NEXT: [[ZSUM:%.*]] = add i64 [[SUM2]], 0
+; CHECK-NEXT: [[JOIN:%.*]] = add i64 [[TMP6]], [[ZSUM]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i64 [[TMP10]], 0
+; CHECK-NEXT: [[OP_EXTRA3]] = add i64 [[OP_EXTRA]], [[TMP6]]
+; CHECK-NEXT: [[LAST:%.*]] = add i64 [[JOIN]], undef
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %dummy_phi = phi i64 [ 1, %entry ], [ %last, %loop ]
+ %0 = phi i64 [ 2, %entry ], [ %fork, %loop ]
+ %inc1 = add i64 %0, 1
+ %inc2 = add i64 %0, 2
+ %inc11 = add i64 1, %inc1
+ %exact1 = ashr exact i64 %inc11, 32
+ %inc3 = add i64 %0, 3
+ %dummy_add = add i16 0, 0
+ %inc12 = add i64 1, %inc2
+ %exact2 = ashr exact i64 %inc12, 32
+ %dummy_shl = shl i64 %inc3, 32
+ %inc13 = add i64 1, %inc3
+ %exact3 = ashr exact i64 %inc13, 32
+ %fork = add i64 %0, 0
+ %sum1 = add i64 %exact3, %exact2
+ %sum2 = add i64 %sum1, %exact1
+ %zsum = add i64 %sum2, 0
+ %sext22 = add i64 1, %fork
+ %exact4 = ashr exact i64 %sext22, 32
+ %join = add i64 %fork, %zsum
+ %last = add i64 %join, %exact4
+ br label %loop
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
new file mode 100644
index 00000000000..4a403e7b9a3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35777.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -verify -slp-vectorizer -o - -S -mtriple=x86_64-apple-macosx10.13.0 | FileCheck %s
+
+@global = local_unnamed_addr global [6 x double] zeroinitializer, align 16
+
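+; Two interleaved fmul/fadd/fptosi chains over @global; expected to fold into
+; <2 x double> arithmetic whose lanes are extracted into the { i64, i64 } result.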
+define { i64, i64 } @patatino(double %arg) {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP11]], 1
+; CHECK-NEXT: ret { i64, i64 } [[TMP17]]
+;
+bb:
+ %tmp = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 0), align 16
+ %tmp1 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2), align 16
+ %tmp2 = fmul double %tmp1, %arg
+ %tmp3 = fadd double %tmp, %tmp2
+ %tmp4 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4), align 16
+ %tmp5 = fadd double %tmp4, %tmp3
+ %tmp6 = fptosi double %tmp5 to i32
+ %tmp7 = sext i32 %tmp6 to i64
+ %tmp8 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 1), align 8
+ %tmp9 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 3), align 8
+ %tmp10 = fmul double %tmp9, %arg
+ %tmp11 = fadd double %tmp8, %tmp10
+ %tmp12 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 5), align 8
+ %tmp13 = fadd double %tmp12, %tmp11
+ %tmp14 = fptosi double %tmp13 to i32
+ %tmp15 = sext i32 %tmp14 to i64
+ %tmp16 = insertvalue { i64, i64 } undef, i64 %tmp7, 0
+ %tmp17 = insertvalue { i64, i64 } %tmp16, i64 %tmp15, 1
+ ret { i64, i64 } %tmp17
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
new file mode 100644
index 00000000000..b022dd7d915
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35865.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
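+; The fpext/bitcast/insertelement pairs are expected to stay scalar; the
+; checks pin the current scalar form.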
+define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
+; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4
+; CHECK-NEXT: [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32
+; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5
+; CHECK-NEXT: [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32
+; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = extractelement <16 x half> undef, i32 4
+ %conv.i.4.i = fpext half %0 to float
+ %1 = bitcast float %conv.i.4.i to i32
+ %vecins.i.4.i = insertelement <8 x i32> undef, i32 %1, i32 4
+ %2 = extractelement <16 x half> undef, i32 5
+ %conv.i.5.i = fpext half %2 to float
+ %3 = bitcast float %conv.i.5.i to i32
+ %vecins.i.5.i = insertelement <8 x i32> %vecins.i.4.i, i32 %3, i32 5
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll
new file mode 100644
index 00000000000..1001468fd6f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR36280.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+
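+; The two fmul/fadd chains do not form a profitable vector pattern here; the
+; checks expect unchanged scalar code.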
+define float @jacobi(float* %p, float %x, float %y, float %z) {
+; CHECK-LABEL: @jacobi(
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, float* [[P:%.*]], i64 1
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, float* [[P]], i64 2
+; CHECK-NEXT: [[P1:%.*]] = load float, float* [[GEP1]]
+; CHECK-NEXT: [[P2:%.*]] = load float, float* [[GEP2]]
+; CHECK-NEXT: [[MUL1:%.*]] = fmul float [[P1]], [[X:%.*]]
+; CHECK-NEXT: [[MUL2:%.*]] = fmul float [[P2]], [[Y:%.*]]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[MUL1]], [[Z:%.*]]
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[MUL2]], [[ADD1]]
+; CHECK-NEXT: ret float [[ADD2]]
+;
+ %gep1 = getelementptr float, float* %p, i64 1
+ %gep2 = getelementptr float, float* %p, i64 2
+ %p1 = load float, float* %gep1
+ %p2 = load float, float* %gep2
+ %mul1 = fmul float %p1, %x
+ %mul2 = fmul float %p2, %y
+ %add1 = fadd float %mul1, %z
+ %add2 = fadd float %mul2, %add1
+ ret float %add2
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
new file mode 100644
index 00000000000..24f75b32c5d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -0,0 +1,237 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-7 | FileCheck %s --check-prefixes=ALL,CHECK
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefixes=ALL,FORCE_REDUCTION
+
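+; Long chain of and-reductions over a loop-carried pair. The first RUN line
+; checks the default skylake pipeline; the second lowers the threshold and
+; minimum tree size to force the reduction path (FORCE_REDUCTION prefix).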
+define void @Test(i32) {
+; CHECK-LABEL: @Test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef
+; CHECK-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
+; CHECK-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
+; CHECK-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]]
+; CHECK-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]]
+; CHECK-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef
+; CHECK-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]]
+; CHECK-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]]
+; CHECK-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]]
+; CHECK-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef
+; CHECK-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]]
+; CHECK-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]]
+; CHECK-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]]
+; CHECK-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]]
+; CHECK-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]]
+; CHECK-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef
+; CHECK-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], undef
+; CHECK-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]]
+; CHECK-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]]
+; CHECK-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]]
+; CHECK-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]]
+; CHECK-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]]
+; CHECK-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]]
+; CHECK-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]]
+; CHECK-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]]
+; CHECK-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]]
+; CHECK-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]]
+; CHECK-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]]
+; CHECK-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]]
+; CHECK-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], undef
+; CHECK-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]]
+; CHECK-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]]
+; CHECK-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]]
+; CHECK-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]]
+; CHECK-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]]
+; CHECK-NEXT: [[VAL_42:%.*]] = and i32 [[VAL_40]], undef
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[OP_EXTRA30]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 14910, i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1
+; CHECK-NEXT: br label [[LOOP]]
+;
+; FORCE_REDUCTION-LABEL: @Test(
+; FORCE_REDUCTION-NEXT: entry:
+; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]]
+; FORCE_REDUCTION: loop:
+; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
+; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
+; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
+; FORCE_REDUCTION-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef
+; FORCE_REDUCTION-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
+; FORCE_REDUCTION-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef
+; FORCE_REDUCTION-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef
+; FORCE_REDUCTION-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef
+; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496
+; FORCE_REDUCTION-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], [[VAL_20]]
+; FORCE_REDUCTION-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555
+; FORCE_REDUCTION-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], [[VAL_34]]
+; FORCE_REDUCTION-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; FORCE_REDUCTION-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP3]], [[RDX_SHUF]]
+; FORCE_REDUCTION-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; FORCE_REDUCTION-NEXT: [[BIN_RDX2:%.*]] = and <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]]
+; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP2]]
+; FORCE_REDUCTION-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]]
+; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529
+; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA29]], [[VAL_39]]
+; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685
+; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0
+; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1
+; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0
+; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1
+; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]]
+; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]]
+; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; FORCE_REDUCTION-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %local_4_39.us = phi i32 [ %val_42, %loop ], [ 0, %entry ]
+ %local_8_43.us = phi i32 [ %val_43, %loop ], [ 0, %entry ]
+ %val_0 = add i32 %local_4_39.us, 0
+ %val_1 = and i32 %local_8_43.us, %val_0
+ %val_2 = and i32 %val_1, %0
+ %val_3 = and i32 %val_2, %0
+ %val_4 = and i32 %val_3, %0
+ %val_5 = and i32 %val_4, %0
+ %val_6 = add i32 %local_8_43.us, 55
+ %val_7 = and i32 %val_5, %val_6
+ %val_8 = and i32 %val_7, %0
+ %val_9 = and i32 %val_8, %0
+ %val_10 = and i32 %val_9, %0
+ %val_11 = add i32 %local_8_43.us, 285
+ %val_12 = and i32 %val_10, %val_11
+ %val_13 = and i32 %val_12, %0
+ %val_14 = and i32 %val_13, %0
+ %val_15 = and i32 %val_14, %0
+ %val_16 = and i32 %val_15, %0
+ %val_17 = and i32 %val_16, %0
+ %val_18 = add i32 %local_8_43.us, 1240
+ %val_19 = and i32 %val_17, %val_18
+ %val_20 = add i32 %local_8_43.us, 1496
+ %val_21 = and i32 %val_19, %val_20
+ %val_22 = and i32 %val_21, %0
+ %val_23 = and i32 %val_22, %0
+ %val_24 = and i32 %val_23, %0
+ %val_25 = and i32 %val_24, %0
+ %val_26 = and i32 %val_25, %0
+ %val_27 = and i32 %val_26, %0
+ %val_28 = and i32 %val_27, %0
+ %val_29 = and i32 %val_28, %0
+ %val_30 = and i32 %val_29, %0
+ %val_31 = and i32 %val_30, %0
+ %val_32 = and i32 %val_31, %0
+ %val_33 = and i32 %val_32, %0
+ %val_34 = add i32 %local_8_43.us, 8555
+ %val_35 = and i32 %val_33, %val_34
+ %val_36 = and i32 %val_35, %0
+ %val_37 = and i32 %val_36, %0
+ %val_38 = and i32 %val_37, %0
+ %val_39 = add i32 %local_8_43.us, 12529
+ %val_40 = and i32 %val_38, %val_39
+ %val_41 = add i32 %local_8_43.us, 13685
+ %val_42 = and i32 %val_40, %val_41
+ %val_43 = add i32 %local_8_43.us, 14910
+ br label %loop
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
new file mode 100644
index 00000000000..2a0b66ee281
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s
+
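+; Interleaved add/and scan carried by two phis; expected to collapse into a
+; <16 x i32> add plus an and-reduction, with the stored value extracted from
+; lane 15 of the widened phi.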
+define void @mainTest(i32 %param, i32* %vals, i32 %len) {
+; CHECK-LABEL: @mainTest(
+; CHECK-NEXT: bci_15.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 31, i32 undef>, i32 [[PARAM:%.*]], i32 1
+; CHECK-NEXT: br label [[BCI_15:%.*]]
+; CHECK: bci_15:
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
+; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
+; CHECK-NEXT: [[V14:%.*]] = and i32 [[TMP2]], undef
+; CHECK-NEXT: [[V16:%.*]] = and i32 undef, [[V14]]
+; CHECK-NEXT: [[V18:%.*]] = and i32 undef, [[V16]]
+; CHECK-NEXT: [[V20:%.*]] = and i32 undef, [[V18]]
+; CHECK-NEXT: [[V22:%.*]] = and i32 undef, [[V20]]
+; CHECK-NEXT: [[V24:%.*]] = and i32 undef, [[V22]]
+; CHECK-NEXT: [[V26:%.*]] = and i32 undef, [[V24]]
+; CHECK-NEXT: [[V28:%.*]] = and i32 undef, [[V26]]
+; CHECK-NEXT: [[V30:%.*]] = and i32 undef, [[V28]]
+; CHECK-NEXT: [[V32:%.*]] = and i32 undef, [[V30]]
+; CHECK-NEXT: [[V34:%.*]] = and i32 undef, [[V32]]
+; CHECK-NEXT: [[V36:%.*]] = and i32 undef, [[V34]]
+; CHECK-NEXT: [[V38:%.*]] = and i32 undef, [[V36]]
+; CHECK-NEXT: [[V40:%.*]] = and i32 undef, [[V38]]
+; CHECK-NEXT: [[V42:%.*]] = and i32 undef, [[V40]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and <16 x i32> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = and <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[V43:%.*]] = and i32 undef, [[V42]]
+; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0
+; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1
+; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]]
+; CHECK: loopexit:
+; CHECK-NEXT: ret void
+;
+bci_15.preheader:
+ br label %bci_15
+
+bci_15: ; preds = %bci_15.preheader, %bci_15
+ %local_0_ = phi i32 [ %v43, %bci_15 ], [ %param, %bci_15.preheader ]
+ %local_4_ = phi i32 [ %v44, %bci_15 ], [ 31, %bci_15.preheader ]
+ %v12 = add i32 %local_0_, -1
+ store atomic i32 %local_0_, i32* %vals unordered, align 4
+ %v13 = add i32 %local_4_, 1
+ %v14 = and i32 %local_4_, %v12
+ %v15 = add i32 %local_4_, 2
+ %v16 = and i32 %v13, %v14
+ %v17 = add i32 %local_4_, 3
+ %v18 = and i32 %v15, %v16
+ %v19 = add i32 %local_4_, 4
+ %v20 = and i32 %v17, %v18
+ %v21 = add i32 %local_4_, 5
+ %v22 = and i32 %v19, %v20
+ %v23 = add i32 %local_4_, 6
+ %v24 = and i32 %v21, %v22
+ %v25 = add i32 %local_4_, 7
+ %v26 = and i32 %v23, %v24
+ %v27 = add i32 %local_4_, 8
+ %v28 = and i32 %v25, %v26
+ %v29 = add i32 %local_4_, 9
+ %v30 = and i32 %v27, %v28
+ %v31 = add i32 %local_4_, 10
+ %v32 = and i32 %v29, %v30
+ %v33 = add i32 %local_4_, 11
+ %v34 = and i32 %v31, %v32
+ %v35 = add i32 %local_4_, 12
+ %v36 = and i32 %v33, %v34
+ %v37 = add i32 %local_4_, 13
+ %v38 = and i32 %v35, %v36
+ %v39 = add i32 %local_4_, 14
+ %v40 = and i32 %v37, %v38
+ %v41 = add i32 %local_4_, 15
+ %v42 = and i32 %v39, %v40
+ %v43 = and i32 %v41, %v42
+ %v44 = add i32 %local_4_, 16
+ br i1 true, label %bci_15, label %loopexit
+
+loopexit:
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
new file mode 100644
index 00000000000..510bc9e5b20
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -0,0 +1,390 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [4 x i32] zeroinitializer, align 16
+@c = common global [4 x i32] zeroinitializer, align 16
+@d = common global [4 x i32] zeroinitializer, align 16
+@e = common global [4 x i32] zeroinitializer, align 16
+@a = common global [4 x i32] zeroinitializer, align 16
+@fb = common global [4 x float] zeroinitializer, align 16
+@fc = common global [4 x float] zeroinitializer, align 16
+@fa = common global [4 x float] zeroinitializer, align 16
+@fd = common global [4 x float] zeroinitializer, align 16
+
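+; Alternating add/sub (and fadd/fsub) patterns over the arrays above; each
+; function below should vectorize into two <4 x ...> ops blended by a single
+; shufflevector.
+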
+; Function Attrs: nounwind uwtable
+define void @addsub() #0 {
+; CHECK-LABEL: @addsub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 0), align 4
+ %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 0), align 4
+ %add = add nsw i32 %0, %1
+ %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 0), align 4
+ %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 0), align 4
+ %add1 = add nsw i32 %2, %3
+ %add2 = add nsw i32 %add, %add1
+ store i32 %add2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 0), align 4
+ %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 1), align 4
+ %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %4, %5
+ %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 1), align 4
+ %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 1), align 4
+ %add4 = add nsw i32 %6, %7
+ %sub = sub nsw i32 %add3, %add4
+ store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 1), align 4
+ %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 2), align 4
+ %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 2), align 4
+ %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %add7 = add nsw i32 %add5, %add6
+ store i32 %add7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 2), align 4
+ %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 3), align 4
+ %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 3), align 4
+ %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %sub10 = sub nsw i32 %add8, %add9
+ store i32 %sub10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 3), align 4
+ ret void
+}
+
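+; As @addsub with the operations swapped: sub in lanes 0 and 2, add in lanes 1 and 3.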
+; Function Attrs: nounwind uwtable
+define void @subadd() #0 {
+; CHECK-LABEL: @subadd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 0), align 4
+ %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 0), align 4
+ %add = add nsw i32 %0, %1
+ %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 0), align 4
+ %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 0), align 4
+ %add1 = add nsw i32 %2, %3
+ %sub = sub nsw i32 %add, %add1
+ store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 0), align 4
+ %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 1), align 4
+ %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 1), align 4
+ %add2 = add nsw i32 %4, %5
+ %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 1), align 4
+ %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %6, %7
+ %add4 = add nsw i32 %add2, %add3
+ store i32 %add4, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 1), align 4
+ %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 2), align 4
+ %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 2), align 4
+ %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %sub7 = sub nsw i32 %add5, %add6
+ store i32 %sub7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 2), align 4
+ %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 3), align 4
+ %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 3), align 4
+ %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %add10 = add nsw i32 %add8, %add9
+ store i32 %add10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 3), align 4
+ ret void
+}
+
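+; Floating-point variant: fadd in lanes 0 and 2, fsub in lanes 1 and 3.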
+; Function Attrs: nounwind uwtable
+define void @faddfsub() #0 {
+; CHECK-LABEL: @faddfsub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+ %add = fadd float %0, %1
+ store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+ %sub = fsub float %2, %3
+ store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+ %add1 = fadd float %4, %5
+ store float %add1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+ %sub2 = fsub float %6, %7
+ store float %sub2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
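+; Floating-point variant with the opposite alternation: fsub in lanes 0 and 2,
+; fadd in lanes 1 and 3.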
+; Function Attrs: nounwind uwtable
+define void @fsubfadd() #0 {
+; CHECK-LABEL: @fsubfadd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+ %sub = fsub float %0, %1
+ store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+ %add = fadd float %2, %3
+ store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+ %sub1 = fsub float %4, %5
+ store float %sub1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+ %add2 = fadd float %6, %7
+ store float %add2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
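+; Only lane 3 subtracts here, so the expected blend mask is <0, 1, 2, 7>
+; rather than a strict alternation.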
+; Function Attrs: nounwind uwtable
+define void @faddfsub_select() #0 {
+; CHECK-LABEL: @faddfsub_select(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+ %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+ %add = fadd float %0, %1
+ store float %add, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+ %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+ %add1 = fadd float %2, %3
+ store float %add1, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+ %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+ %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+ %add2 = fadd float %4, %5
+ store float %add2, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+ %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+ %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+ %sub = fsub float %6, %7
+ store float %sub, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+ ret void
+}
+
+; Check vectorization of the following code for the float data type:
+; fc[0] = fb[0]+fa[0]; //swapped fb and fa
+; fc[1] = fa[1]-fb[1];
+; fc[2] = fa[2]+fb[2];
+; fc[3] = fa[3]-fb[3];
+
+define void @reorder_alt() #0 {
+; CHECK-LABEL: @reorder_alt(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT: ret void
+;
+ %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+ %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+ %3 = fadd float %1, %2
+ store float %3, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+ %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+ %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+ %6 = fsub float %4, %5
+ store float %6, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+ %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+ %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+ %9 = fadd float %7, %8
+ store float %9, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+ %10 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+ %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+ %12 = fsub float %10, %11
+ store float %12, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+ ret void
+}
+
+; Check vectorization of the following code for the float data type:
+; fc[0] = fa[0]+(fb[0]-fd[0]);
+; fc[1] = fa[1]-(fb[1]+fd[1]);
+; fc[2] = fa[2]+(fb[2]-fd[2]);
+; fc[3] = fa[3]-(fd[3]+fb[3]); //swapped fd and fb
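+;
+; Per the CHECK lines below, the sub-expressions are vectorized first: an
+; fsub and an fadd of fb and fd are blended with mask <0, 5, 2, 7>, and the
+; outer fadd/fsub of fa with that sub-tree result is blended with the same
+; mask.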
+
+define void @reorder_alt_subTree() #0 {
+; CHECK-LABEL: @reorder_alt_subTree(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fa to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fd to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([4 x float]* @fb to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = fsub <4 x float> [[TMP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([4 x float]* @fc to <4 x float>*), align 4
+; CHECK-NEXT: ret void
+;
+ %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+ %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+ %3 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 0), align 4
+ %4 = fsub float %2, %3
+ %5 = fadd float %1, %4
+ store float %5, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+ %6 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+ %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+ %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 1), align 4
+ %9 = fadd float %7, %8
+ %10 = fsub float %6, %9
+ store float %10, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+ %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+ %12 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+ %13 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 2), align 4
+ %14 = fsub float %12, %13
+ %15 = fadd float %11, %14
+ store float %15, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+ %16 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+ %17 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fd, i32 0, i64 3), align 4
+ %18 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+ %19 = fadd float %17, %18
+ %20 = fsub float %16, %19
+ store float %20, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+ ret void
+}
+
+; Check vectorization of the following code for the double data type:
+; c[0] = (a[0]+b[0])-d[0];
+; c[1] = d[1]+(a[1]+b[1]); //swapped d[1] and (a[1]+b[1])
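+;
+; As the CHECK lines show, both a <2 x double> fsub and fadd of (a+b) and d
+; are computed and blended with mask <0, 3>: the fsub supplies lane 0 and
+; the fadd supplies lane 1.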
+
+define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) {
+; CHECK-LABEL: @reorder_alt_rightsubTree(
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[D]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[B]] to <2 x double>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[C]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP15]], align 8
+; CHECK-NEXT: ret void
+;
+ %1 = load double, double* %a
+ %2 = load double, double* %b
+ %3 = fadd double %1, %2
+ %4 = load double, double* %d
+ %5 = fsub double %3, %4
+ store double %5, double* %c
+ %6 = getelementptr inbounds double, double* %d, i64 1
+ %7 = load double, double* %6
+ %8 = getelementptr inbounds double, double* %a, i64 1
+ %9 = load double, double* %8
+ %10 = getelementptr inbounds double, double* %b, i64 1
+ %11 = load double, double* %10
+ %12 = fadd double %9, %11
+ %13 = fadd double %7, %12
+ %14 = getelementptr inbounds double, double* %c, i64 1
+ store double %13, double* %14
+ ret void
+}
+
+; Don't vectorize the following code for the float data type, as fsub is not commutative:
+; fc[0] = fb[0]+fa[0];
+; fc[1] = fa[1]-fb[1];
+; fc[2] = fa[2]+fb[2];
+; fc[3] = fb[3]-fa[3];
+; In the above code we can swap the operands of the 1st operation because
+; fadd is commutative, but not those of the 2nd or 4th because fsub is not.
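+; A minimal numeric sketch (hypothetical values, not part of the test): with
+; fb[3] = 1.0 and fa[3] = 3.0, fb[3]-fa[3] = -2.0 while fa[3]-fb[3] = 2.0,
+; so the 4th lane cannot be rewritten to share an operand order with the
+; 2nd lane, and no single fadd/fsub + shuffle pattern covers all four lanes.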
+
+define void @no_vec_shuff_reorder() #0 {
+; CHECK-LABEL: @no_vec_shuff_reorder(
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]]
+; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]]
+; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]]
+; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4
+ %2 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4
+ %3 = fadd float %1, %2
+ store float %3, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4
+ %4 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4
+ %5 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4
+ %6 = fsub float %4, %5
+ store float %6, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4
+ %7 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4
+ %8 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4
+ %9 = fadd float %7, %8
+ store float %9, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4
+ %10 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4
+ %11 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4
+ %12 = fsub float %10, %11
+ store float %12, float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4
+ ret void
+}
+
+
+attributes #0 = { nounwind }
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/aggregate.ll b/llvm/test/Transforms/SLPVectorizer/X86/aggregate.ll
new file mode 100644
index 00000000000..f270dbf4f78
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/aggregate.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-unknown-linux -mcpu=corei7 -slp-vectorizer < %s | FileCheck %s
+
+%struct.S = type { i8*, i8* }
+
+@kS0 = common global %struct.S zeroinitializer, align 8
+
+define { i64, i64 } @getS() {
+; CHECK-LABEL: @getS(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP0]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { i64, i64 } [[TMP2]], i64 [[TMP1]], 1
+; CHECK-NEXT: ret { i64, i64 } [[TMP3]]
+;
+entry:
+ %0 = load i64, i64* bitcast (%struct.S* @kS0 to i64*), align 8
+ %1 = load i64, i64* bitcast (i8** getelementptr inbounds (%struct.S, %struct.S* @kS0, i64 0, i32 1) to i64*), align 8
+ %2 = insertvalue { i64, i64 } undef, i64 %0, 0
+ %3 = insertvalue { i64, i64 } %2, i64 %1, 1
+ ret { i64, i64 } %3
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/align.ll b/llvm/test/Transforms/SLPVectorizer/X86/align.ll
new file mode 100644
index 00000000000..5c7c4ceae81
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/align.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AGG_TMP_I_I_SROA_0:%.*]] = alloca [3 x double], align 16
+; CHECK-NEXT: [[STORE1:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 1
+; CHECK-NEXT: [[STORE2:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 2
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[STORE1]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %agg.tmp.i.i.sroa.0 = alloca [3 x double], align 16
+ %i0 = load double, double* %a
+ %i1 = load double, double* %b
+ %mul = fmul double %i0, %i1
+ %store1 = getelementptr inbounds [3 x double], [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 1
+ %store2 = getelementptr inbounds [3 x double], [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 2
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %store1
+ store double %mul5, double* %store2, align 16
+ ret void
+}
+
+; Float has 4-byte ABI alignment on x86_64. We must use the alignment of the
+; value being loaded/stored, not the alignment of the pointer type.
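+;
+; For illustration (hypothetical IR, not part of the test): the natural ABI
+; alignment of <4 x float> on x86_64 is 16 bytes per this file's datalayout,
+; so a naive lowering might assume
+;   %v = load <4 x float>, <4 x float>* %a, align 16
+; but the scalar accesses only guarantee 4-byte alignment, so the vectorized
+; load and store below must keep "align 4".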
+
+define void @test2(float * %a, float * %b) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
+; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
+; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %l0 = load float, float* %a
+ %a1 = getelementptr inbounds float, float* %a, i64 1
+ %l1 = load float, float* %a1
+ %a2 = getelementptr inbounds float, float* %a, i64 2
+ %l2 = load float, float* %a2
+ %a3 = getelementptr inbounds float, float* %a, i64 3
+ %l3 = load float, float* %a3
+ store float %l0, float* %b
+ %b1 = getelementptr inbounds float, float* %b, i64 1
+ store float %l1, float* %b1
+ %b2 = getelementptr inbounds float, float* %b, i64 2
+ store float %l2, float* %b2
+ %b3 = getelementptr inbounds float, float* %b, i64 3
+ store float %l3, float* %b3
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
new file mode 100644
index 00000000000..4c72225abb2
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define <8 x float> @ceil_floor(<8 x float> %a) {
+; CHECK-LABEL: @ceil_floor(
+; CHECK-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; CHECK-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; CHECK-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
+; CHECK-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
+; CHECK-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; CHECK-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
+; CHECK-NEXT: [[AB4:%.*]] = call float @llvm.ceil.f32(float [[A4]])
+; CHECK-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]])
+; CHECK-NEXT: [[AB6:%.*]] = call float @llvm.floor.f32(float [[A6]])
+; CHECK-NEXT: [[AB7:%.*]] = call float @llvm.floor.f32(float [[A7]])
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %ab0 = call float @llvm.ceil.f32(float %a0)
+ %ab1 = call float @llvm.floor.f32(float %a1)
+ %ab2 = call float @llvm.floor.f32(float %a2)
+ %ab3 = call float @llvm.ceil.f32(float %a3)
+ %ab4 = call float @llvm.ceil.f32(float %a4)
+ %ab5 = call float @llvm.ceil.f32(float %a5)
+ %ab6 = call float @llvm.floor.f32(float %a6)
+ %ab7 = call float @llvm.floor.f32(float %a7)
+ %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %ab1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %ab2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %ab3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %ab4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %ab5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %ab6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %ab7, i32 7
+ ret <8 x float> %r7
+}
+
+declare float @llvm.ceil.f32(float)
+declare float @llvm.floor.f32(float)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
new file mode 100644
index 00000000000..8f8b1d443da
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -0,0 +1,489 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
+; SSE-LABEL: @sitofp_uitofp(
+; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float
+; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float
+; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float
+; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float
+; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float
+; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float
+; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT: ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @sitofp_uitofp(
+; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SLM-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SLM-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SLM-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float
+; SLM-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float
+; SLM-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float
+; SLM-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float
+; SLM-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float
+; SLM-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float
+; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SLM-NEXT: ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @sitofp_uitofp(
+; AVX-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @sitofp_uitofp(
+; AVX512-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %ab0 = sitofp i32 %a0 to float
+ %ab1 = sitofp i32 %a1 to float
+ %ab2 = sitofp i32 %a2 to float
+ %ab3 = sitofp i32 %a3 to float
+ %ab4 = uitofp i32 %a4 to float
+ %ab5 = uitofp i32 %a5 to float
+ %ab6 = uitofp i32 %a6 to float
+ %ab7 = uitofp i32 %a7 to float
+ %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %ab1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %ab2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %ab3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %ab4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %ab5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %ab6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %ab7, i32 7
+ ret <8 x float> %r7
+}
+
+define <8 x i32> @fptosi_fptoui(<8 x float> %a) {
+; SSE-LABEL: @fptosi_fptoui(
+; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32
+; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32
+; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32
+; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
+; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
+; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
+; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
+; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SSE-NEXT: ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @fptosi_fptoui(
+; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; SLM-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32
+; SLM-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32
+; SLM-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32
+; SLM-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32
+; SLM-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
+; SLM-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
+; SLM-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
+; SLM-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
+; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SLM-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX-LABEL: @fptosi_fptoui(
+; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0
+; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1
+; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2
+; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3
+; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4
+; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5
+; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6
+; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7
+; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32
+; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32
+; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32
+; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32
+; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32
+; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32
+; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32
+; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32
+; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @fptosi_fptoui(
+; AVX512-NEXT: [[TMP1:%.*]] = fptosi <8 x float> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[A]] to <8 x i32>
+; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[R7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %ab0 = fptosi float %a0 to i32
+ %ab1 = fptosi float %a1 to i32
+ %ab2 = fptosi float %a2 to i32
+ %ab3 = fptosi float %a3 to i32
+ %ab4 = fptoui float %a4 to i32
+ %ab5 = fptoui float %a5 to i32
+ %ab6 = fptoui float %a6 to i32
+ %ab7 = fptoui float %a7 to i32
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
+
+define <8 x float> @fneg_fabs(<8 x float> %a) {
+; CHECK-LABEL: @fneg_fabs(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i32> [[TMP1]], <i32 undef, i32 undef, i32 undef, i32 undef, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP4]] to <8 x float>
+; CHECK-NEXT: ret <8 x float> [[TMP5]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %aa0 = bitcast float %a0 to i32
+ %aa1 = bitcast float %a1 to i32
+ %aa2 = bitcast float %a2 to i32
+ %aa3 = bitcast float %a3 to i32
+ %aa4 = bitcast float %a4 to i32
+ %aa5 = bitcast float %a5 to i32
+ %aa6 = bitcast float %a6 to i32
+ %aa7 = bitcast float %a7 to i32
+ %ab0 = xor i32 %aa0, -2147483648
+ %ab1 = xor i32 %aa1, -2147483648
+ %ab2 = xor i32 %aa2, -2147483648
+ %ab3 = xor i32 %aa3, -2147483648
+ %ab4 = and i32 %aa4, 2147483647
+ %ab5 = and i32 %aa5, 2147483647
+ %ab6 = and i32 %aa6, 2147483647
+ %ab7 = and i32 %aa7, 2147483647
+ %ac0 = bitcast i32 %ab0 to float
+ %ac1 = bitcast i32 %ab1 to float
+ %ac2 = bitcast i32 %ab2 to float
+ %ac3 = bitcast i32 %ab3 to float
+ %ac4 = bitcast i32 %ab4 to float
+ %ac5 = bitcast i32 %ab5 to float
+ %ac6 = bitcast i32 %ab6 to float
+ %ac7 = bitcast i32 %ab7 to float
+ %r0 = insertelement <8 x float> undef, float %ac0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %ac1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %ac2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %ac3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %ac4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %ac5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %ac6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %ac7, i32 7
+ ret <8 x float> %r7
+}
+
+define <8 x i32> @sext_zext(<8 x i16> %a) {
+; CHECK-LABEL: @sext_zext(
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[R7]]
+;
+ %a0 = extractelement <8 x i16> %a, i32 0
+ %a1 = extractelement <8 x i16> %a, i32 1
+ %a2 = extractelement <8 x i16> %a, i32 2
+ %a3 = extractelement <8 x i16> %a, i32 3
+ %a4 = extractelement <8 x i16> %a, i32 4
+ %a5 = extractelement <8 x i16> %a, i32 5
+ %a6 = extractelement <8 x i16> %a, i32 6
+ %a7 = extractelement <8 x i16> %a, i32 7
+ %ab0 = sext i16 %a0 to i32
+ %ab1 = sext i16 %a1 to i32
+ %ab2 = sext i16 %a2 to i32
+ %ab3 = sext i16 %a3 to i32
+ %ab4 = zext i16 %a4 to i32
+ %ab5 = zext i16 %a5 to i32
+ %ab6 = zext i16 %a6 to i32
+ %ab7 = zext i16 %a7 to i32
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
+
+define <8 x float> @sitofp_4i32_8i16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK-LABEL: @sitofp_4i32_8i16(
+; CHECK-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; CHECK-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; CHECK-NEXT: [[B2:%.*]] = extractelement <8 x i16> [[B]], i32 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <8 x i16> [[B]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; CHECK-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; CHECK-NEXT: [[AB5:%.*]] = sitofp i16 [[B1]] to float
+; CHECK-NEXT: [[AB6:%.*]] = sitofp i16 [[B2]] to float
+; CHECK-NEXT: [[AB7:%.*]] = sitofp i16 [[B3]] to float
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; CHECK-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %b0 = extractelement <8 x i16> %b, i32 0
+ %b1 = extractelement <8 x i16> %b, i32 1
+ %b2 = extractelement <8 x i16> %b, i32 2
+ %b3 = extractelement <8 x i16> %b, i32 3
+ %ab0 = sitofp i32 %a0 to float
+ %ab1 = sitofp i32 %a1 to float
+ %ab2 = sitofp i32 %a2 to float
+ %ab3 = sitofp i32 %a3 to float
+ %ab4 = sitofp i16 %b0 to float
+ %ab5 = sitofp i16 %b1 to float
+ %ab6 = sitofp i16 %b2 to float
+ %ab7 = sitofp i16 %b3 to float
+ %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %ab1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %ab2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %ab3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %ab4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %ab5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %ab6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %ab7, i32 7
+ ret <8 x float> %r7
+}
+
+; Inspired by PR38154
+define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) {
+; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SSE-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float
+; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SSE-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SSE-NEXT: ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; SLM-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; SLM-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; SLM-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; SLM-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; SLM-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float
+; SLM-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; SLM-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; SLM-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; SLM-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; SLM-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SLM-NEXT: ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; AVX-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; AVX-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; AVX-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; AVX-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
+; AVX-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
+; AVX-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float
+; AVX-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; AVX-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
+; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
+; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
+; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; AVX-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; AVX-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX-NEXT: ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
+; AVX512-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
+; AVX512-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
+; AVX512-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
+; AVX512-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
+; AVX512-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
+; AVX512-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float
+; AVX512-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float
+; AVX512-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float
+; AVX512-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float
+; AVX512-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0
+; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
+; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
+; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
+; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
+; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
+; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; AVX512-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %b0 = extractelement <8 x i16> %b, i32 0
+ %b1 = extractelement <8 x i16> %b, i32 1
+ %c0 = extractelement <16 x i8> %c, i32 0
+ %c1 = extractelement <16 x i8> %c, i32 1
+ %ab0 = sitofp i32 %a0 to float
+ %ab1 = sitofp i32 %a1 to float
+ %ab2 = uitofp i32 %a2 to float
+ %ab3 = uitofp i32 %a3 to float
+ %ab4 = sitofp i16 %b0 to float
+ %ab5 = uitofp i16 %b1 to float
+ %ab6 = sitofp i8 %c0 to float
+ %ab7 = uitofp i8 %c1 to float
+ %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %ab1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %ab2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %ab3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %ab4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %ab5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %ab6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %ab7, i32 7
+ ret <8 x float> %r7
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
new file mode 100644
index 00000000000..de7c59286ac
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+define <8 x float> @fadd_fsub_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @fadd_fsub_v8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <8 x float> [[A]], [[B]]
+; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; CHECK-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %ab0 = fadd float %a0, %b0
+ %ab1 = fsub float %a1, %b1
+ %ab2 = fsub float %a2, %b2
+ %ab3 = fadd float %a3, %b3
+ %ab4 = fadd float %a4, %b4
+ %ab5 = fsub float %a5, %b5
+ %ab6 = fsub float %a6, %b6
+ %ab7 = fadd float %a7, %b7
+ %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %ab1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %ab2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %ab3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %ab4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %ab5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %ab6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %ab7, i32 7
+ ret <8 x float> %r7
+}
+
+define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @fmul_fdiv_v8f32(
+; SSE-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; SSE-NEXT: ret <8 x float> [[R7]]
+;
+; SLM-LABEL: @fmul_fdiv_v8f32(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]]
+; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]]
+; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT: [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> <i32 4, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT: [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 undef, i32 undef, i32 undef>
+; SLM-NEXT: [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 9, i32 10, i32 undef>
+; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
+; SLM-NEXT: ret <8 x float> [[R7]]
+;
+; AVX-LABEL: @fmul_fdiv_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX-NEXT: ret <8 x float> [[R7]]
+;
+; AVX512-LABEL: @fmul_fdiv_v8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = fdiv <8 x float> [[A]], [[B]]
+; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 13, i32 14, i32 7>
+; AVX512-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %ab0 = fmul float %a0, %b0
+ %ab1 = fdiv float %a1, %b1
+ %ab2 = fdiv float %a2, %b2
+ %ab3 = fmul float %a3, %b3
+ %ab4 = fmul float %a4, %b4
+ %ab5 = fdiv float %a5, %b5
+ %ab6 = fdiv float %a6, %b6
+ %ab7 = fmul float %a7, %b7
+ %r0 = insertelement <8 x float> undef, float %ab0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %ab1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %ab2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %ab3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %ab4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %ab5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %ab6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %ab7, i32 7
+ ret <8 x float> %r7
+}
+
+define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
+; SSE-LABEL: @fmul_fdiv_v4f32_const(
+; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT: ret <4 x float> [[TMP1]]
+;
+; SLM-LABEL: @fmul_fdiv_v4f32_const(
+; SLM-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
+; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT: ret <4 x float> [[R3]]
+;
+; AVX-LABEL: @fmul_fdiv_v4f32_const(
+; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT: ret <4 x float> [[TMP1]]
+;
+; AVX512-LABEL: @fmul_fdiv_v4f32_const(
+; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT: ret <4 x float> [[TMP1]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %ab0 = fmul float %a0, 2.0
+ %ab1 = fmul float %a1, 1.0
+ %ab2 = fdiv float %a2, 1.0
+ %ab3 = fdiv float %a3, 0.5
+ %r0 = insertelement <4 x float> undef, float %ab0, i32 0
+ %r1 = insertelement <4 x float> %r0, float %ab1, i32 1
+ %r2 = insertelement <4 x float> %r1, float %ab2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %ab3, i32 3
+ ret <4 x float> %r3
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
new file mode 100644
index 00000000000..44729b4a8d5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -0,0 +1,571 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
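+; Each test below builds a vector result lane-by-lane from two interleaved scalar
+; opcodes; the expected SLP output (where profitable) is two full-width vector ops
+; blended together with a shufflevector.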
+define <8 x i32> @add_sub_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @add_sub_v8i32(
+; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub <8 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[R7]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %ab0 = add i32 %a0, %b0
+ %ab1 = add i32 %a1, %b1
+ %ab2 = add i32 %a2, %b2
+ %ab3 = add i32 %a3, %b3
+ %ab4 = sub i32 %a4, %b4
+ %ab5 = sub i32 %a5, %b5
+ %ab6 = sub i32 %a6, %b6
+ %ab7 = sub i32 %a7, %b7
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
+
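+; add in the low pair of lanes, and in the high pair: all targets blend two
+; <4 x i32> ops.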
+define <4 x i32> @add_and_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @add_and_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT: ret <4 x i32> [[R3]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %ab0 = add i32 %a0, %b0
+ %ab1 = add i32 %a1, %b1
+ %ab2 = and i32 %a2, %b2
+ %ab3 = and i32 %a3, %b3
+ %r0 = insertelement <4 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <4 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <4 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <4 x i32> %r2, i32 %ab3, i32 3
+ ret <4 x i32> %r3
+}
+
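+; mul/add alternate per lane; SLM keeps this scalar (its cost model rates vector
+; i32 multiplies as expensive), while the other targets blend a vector mul and add.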
+define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: @add_mul_v4i32(
+; SSE-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; SSE-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; SSE-NEXT: ret <4 x i32> [[R3]]
+;
+; SLM-LABEL: @add_mul_v4i32(
+; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
+; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1
+; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2
+; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3
+; SLM-NEXT: [[AB0:%.*]] = mul i32 [[A0]], [[B0]]
+; SLM-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B1]]
+; SLM-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B2]]
+; SLM-NEXT: [[AB3:%.*]] = mul i32 [[A3]], [[B3]]
+; SLM-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT: ret <4 x i32> [[R3]]
+;
+; AVX-LABEL: @add_mul_v4i32(
+; AVX-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; AVX-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT: ret <4 x i32> [[R3]]
+;
+; AVX512-LABEL: @add_mul_v4i32(
+; AVX512-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; AVX512-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX512-NEXT: ret <4 x i32> [[R3]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %ab0 = mul i32 %a0, %b0
+ %ab1 = add i32 %a1, %b1
+ %ab2 = add i32 %a2, %b2
+ %ab3 = mul i32 %a3, %b3
+ %r0 = insertelement <4 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <4 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <4 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <4 x i32> %r2, i32 %ab3, i32 3
+ ret <4 x i32> %r3
+}
+
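+; ashr/shl split across the two halves; SSE only vectorizes the shl half and
+; leaves the ashr lanes scalar, while SLM/AVX/AVX512 emit two full-width shifts.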
+define <8 x i32> @ashr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: @ashr_shl_v8i32(
+; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SSE-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]]
+; SSE-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]]
+; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @ashr_shl_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SLM-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX-LABEL: @ashr_shl_v8i32(
+; AVX-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]]
+; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]]
+; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[R7]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %ab0 = ashr i32 %a0, %b0
+ %ab1 = ashr i32 %a1, %b1
+ %ab2 = ashr i32 %a2, %b2
+ %ab3 = ashr i32 %a3, %b3
+ %ab4 = shl i32 %a4, %b4
+ %ab5 = shl i32 %a5, %b5
+ %ab6 = shl i32 %a6, %b6
+ %ab7 = shl i32 %a7, %b7
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
+
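+; With uniform per-half shift amounts the pattern vectorizes as two <4 x i32>
+; shifts, or as a pair of <8 x i32> shifts on AVX2/AVX512.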
+define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
+; SSE-LABEL: @ashr_shl_v8i32_const(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @ashr_shl_v8i32_const(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
+; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+; SLM-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_shl_v8i32_const(
+; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
+; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX1-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_shl_v8i32_const(
+; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_shl_v8i32_const(
+; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
+; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[R7]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %ab0 = ashr i32 %a0, 2
+ %ab1 = ashr i32 %a1, 2
+ %ab2 = ashr i32 %a2, 2
+ %ab3 = ashr i32 %a3, 2
+ %ab4 = shl i32 %a4, 3
+ %ab5 = shl i32 %a5, 3
+ %ab6 = shl i32 %a6, 3
+ %ab7 = shl i32 %a7, 3
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
+
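+; Three alternating shift opcodes; how many lanes get vectorized (and at what
+; width) depends on the per-target shift costs.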
+define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: @ashr_lshr_shl_v8i32(
+; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
+; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2
+; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3
+; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4
+; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
+; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
+; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SSE-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]]
+; SSE-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]]
+; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
+; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SSE-NEXT: ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @ashr_lshr_shl_v8i32(
+; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; SLM-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
+; SLM-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; SLM-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SLM-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SLM-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; SLM-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; SLM-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
+; SLM-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SLM-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX1-LABEL: @ashr_lshr_shl_v8i32(
+; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
+; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
+; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0
+; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1
+; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
+; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]]
+; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]]
+; AVX1-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
+; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2
+; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
+; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3
+; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
+; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4
+; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
+; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5
+; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX1-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX2-LABEL: @ashr_lshr_shl_v8i32(
+; AVX2-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX2-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
+; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
+; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
+; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
+; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
+; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
+; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
+; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX2-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @ashr_lshr_shl_v8i32(
+; AVX512-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]]
+; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
+; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1
+; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2
+; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3
+; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4
+; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4
+; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5
+; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5
+; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; AVX512-NEXT: ret <8 x i32> [[R7]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %ab0 = ashr i32 %a0, %b0
+ %ab1 = ashr i32 %a1, %b1
+ %ab2 = lshr i32 %a2, %b2
+ %ab3 = lshr i32 %a3, %b3
+ %ab4 = lshr i32 %a4, %b4
+ %ab5 = lshr i32 %a5, %b5
+ %ab6 = shl i32 %a6, %b6
+ %ab7 = shl i32 %a7, %b7
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
+
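+; Adding undef is harmless, so this folds to a single vector add with undef
+; lanes in the constant operand.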
+define <8 x i32> @add_v8i32_undefs(<8 x i32> %a) {
+; CHECK-LABEL: @add_v8i32_undefs(
+; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A:%.*]], <i32 undef, i32 4, i32 8, i32 16, i32 undef, i32 4, i32 8, i32 16>
+; CHECK-NEXT: ret <8 x i32> [[TMP1]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %ab0 = add i32 %a0, undef
+ %ab1 = add i32 %a1, 4
+ %ab2 = add i32 %a2, 8
+ %ab3 = add i32 %a3, 16
+ %ab4 = add i32 %a4, undef
+ %ab5 = add i32 %a5, 4
+ %ab6 = add i32 %a6, 8
+ %ab7 = add i32 %a7, 16
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
+
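+; sdiv by undef folds away (undef could be zero), so lanes 0 and 4 are dropped;
+; the surviving sdivs stay scalar since x86 has no vector integer division.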
+define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) {
+; CHECK-LABEL: @sdiv_v8i32_undefs(
+; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
+; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
+; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
+; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4
+; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8
+; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16
+; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4
+; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8
+; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; CHECK-NEXT: ret <8 x i32> [[R7]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %ab0 = sdiv i32 %a0, undef
+ %ab1 = sdiv i32 %a1, 4
+ %ab2 = sdiv i32 %a2, 8
+ %ab3 = sdiv i32 %a3, 16
+ %ab4 = sdiv i32 %a4, undef
+ %ab5 = sdiv i32 %a5, 4
+ %ab6 = sdiv i32 %a6, 8
+ %ab7 = sdiv i32 %a7, 16
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
+
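+; The scalar operand is broadcast with an insertelement + shufflevector splat,
+; then add/sub blend as in add_sub_v8i32.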
+define <8 x i32> @add_sub_v8i32_splat(<8 x i32> %a, i32 %b) {
+; CHECK-LABEL: @add_sub_v8i32_splat(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[B:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[A]]
+; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: ret <8 x i32> [[R7]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %ab0 = add i32 %a0, %b
+ %ab1 = add i32 %b, %a1
+ %ab2 = add i32 %a2, %b
+ %ab3 = add i32 %b, %a3
+ %ab4 = sub i32 %b, %a4
+ %ab5 = sub i32 %b, %a5
+ %ab6 = sub i32 %b, %a6
+ %ab7 = sub i32 %b, %a7
+ %r0 = insertelement <8 x i32> undef, i32 %ab0, i32 0
+ %r1 = insertelement <8 x i32> %r0, i32 %ab1, i32 1
+ %r2 = insertelement <8 x i32> %r1, i32 %ab2, i32 2
+ %r3 = insertelement <8 x i32> %r2, i32 %ab3, i32 3
+ %r4 = insertelement <8 x i32> %r3, i32 %ab4, i32 4
+ %r5 = insertelement <8 x i32> %r4, i32 %ab5, i32 5
+ %r6 = insertelement <8 x i32> %r5, i32 %ab6, i32 6
+ %r7 = insertelement <8 x i32> %r6, i32 %ab7, i32 7
+ ret <8 x i32> %r7
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll
new file mode 100644
index 00000000000..b310c15c2b0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-saddo.ll
@@ -0,0 +1,1254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16)
+declare {i8 , i1} @llvm.sadd.with.overflow.i8 (i8 , i8 )
+
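+; There are no vector forms of the overflow intrinsics, so the adds below are
+; expected to stay fully scalar at every width.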
+define void @add_v8i64() {
+; CHECK-LABEL: @add_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %c0 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a0, i64 %b0)
+ %c1 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a1, i64 %b1)
+ %c2 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a2, i64 %b2)
+ %c3 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a3, i64 %b3)
+ %c4 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a4, i64 %b4)
+ %c5 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a5, i64 %b5)
+ %c6 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a6, i64 %b6)
+ %c7 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %a7, i64 %b7)
+ %r0 = extractvalue {i64, i1} %c0, 0
+ %r1 = extractvalue {i64, i1} %c1, 0
+ %r2 = extractvalue {i64, i1} %c2, 0
+ %r3 = extractvalue {i64, i1} %c3, 0
+ %r4 = extractvalue {i64, i1} %c4, 0
+ %r5 = extractvalue {i64, i1} %c5, 0
+ %r6 = extractvalue {i64, i1} %c6, 0
+ %r7 = extractvalue {i64, i1} %c7, 0
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @add_v16i32() {
+; CHECK-LABEL: @add_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %c0 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a0 , i32 %b0 )
+ %c1 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a1 , i32 %b1 )
+ %c2 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a2 , i32 %b2 )
+ %c3 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a3 , i32 %b3 )
+ %c4 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a4 , i32 %b4 )
+ %c5 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a5 , i32 %b5 )
+ %c6 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a6 , i32 %b6 )
+ %c7 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a7 , i32 %b7 )
+ %c8 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a8 , i32 %b8 )
+ %c9 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a9 , i32 %b9 )
+ %c10 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a10, i32 %b10)
+ %c11 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a11, i32 %b11)
+ %c12 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a12, i32 %b12)
+ %c13 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a13, i32 %b13)
+ %c14 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a14, i32 %b14)
+ %c15 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a15, i32 %b15)
+ %r0 = extractvalue {i32, i1} %c0 , 0
+ %r1 = extractvalue {i32, i1} %c1 , 0
+ %r2 = extractvalue {i32, i1} %c2 , 0
+ %r3 = extractvalue {i32, i1} %c3 , 0
+ %r4 = extractvalue {i32, i1} %c4 , 0
+ %r5 = extractvalue {i32, i1} %c5 , 0
+ %r6 = extractvalue {i32, i1} %c6 , 0
+ %r7 = extractvalue {i32, i1} %c7 , 0
+ %r8 = extractvalue {i32, i1} %c8 , 0
+ %r9 = extractvalue {i32, i1} %c9 , 0
+ %r10 = extractvalue {i32, i1} %c10, 0
+ %r11 = extractvalue {i32, i1} %c11, 0
+ %r12 = extractvalue {i32, i1} %c12, 0
+ %r13 = extractvalue {i32, i1} %c13, 0
+ %r14 = extractvalue {i32, i1} %c14, 0
+ %r15 = extractvalue {i32, i1} %c15, 0
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @add_v32i16() {
+; CHECK-LABEL: @add_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; CHECK-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %c0 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a0 , i16 %b0 )
+ %c1 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a1 , i16 %b1 )
+ %c2 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a2 , i16 %b2 )
+ %c3 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a3 , i16 %b3 )
+ %c4 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a4 , i16 %b4 )
+ %c5 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a5 , i16 %b5 )
+ %c6 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a6 , i16 %b6 )
+ %c7 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a7 , i16 %b7 )
+ %c8 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a8 , i16 %b8 )
+ %c9 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a9 , i16 %b9 )
+ %c10 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a10, i16 %b10)
+ %c11 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a11, i16 %b11)
+ %c12 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a12, i16 %b12)
+ %c13 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a13, i16 %b13)
+ %c14 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a14, i16 %b14)
+ %c15 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a15, i16 %b15)
+ %c16 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a16, i16 %b16)
+ %c17 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a17, i16 %b17)
+ %c18 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a18, i16 %b18)
+ %c19 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a19, i16 %b19)
+ %c20 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a20, i16 %b20)
+ %c21 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a21, i16 %b21)
+ %c22 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a22, i16 %b22)
+ %c23 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a23, i16 %b23)
+ %c24 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a24, i16 %b24)
+ %c25 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a25, i16 %b25)
+ %c26 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a26, i16 %b26)
+ %c27 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a27, i16 %b27)
+ %c28 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a28, i16 %b28)
+ %c29 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a29, i16 %b29)
+ %c30 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a30, i16 %b30)
+ %c31 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %a31, i16 %b31)
+ %r0 = extractvalue {i16, i1} %c0 , 0
+ %r1 = extractvalue {i16, i1} %c1 , 0
+ %r2 = extractvalue {i16, i1} %c2 , 0
+ %r3 = extractvalue {i16, i1} %c3 , 0
+ %r4 = extractvalue {i16, i1} %c4 , 0
+ %r5 = extractvalue {i16, i1} %c5 , 0
+ %r6 = extractvalue {i16, i1} %c6 , 0
+ %r7 = extractvalue {i16, i1} %c7 , 0
+ %r8 = extractvalue {i16, i1} %c8 , 0
+ %r9 = extractvalue {i16, i1} %c9 , 0
+ %r10 = extractvalue {i16, i1} %c10, 0
+ %r11 = extractvalue {i16, i1} %c11, 0
+ %r12 = extractvalue {i16, i1} %c12, 0
+ %r13 = extractvalue {i16, i1} %c13, 0
+ %r14 = extractvalue {i16, i1} %c14, 0
+ %r15 = extractvalue {i16, i1} %c15, 0
+ %r16 = extractvalue {i16, i1} %c16, 0
+ %r17 = extractvalue {i16, i1} %c17, 0
+ %r18 = extractvalue {i16, i1} %c18, 0
+ %r19 = extractvalue {i16, i1} %c19, 0
+ %r20 = extractvalue {i16, i1} %c20, 0
+ %r21 = extractvalue {i16, i1} %c21, 0
+ %r22 = extractvalue {i16, i1} %c22, 0
+ %r23 = extractvalue {i16, i1} %c23, 0
+ %r24 = extractvalue {i16, i1} %c24, 0
+ %r25 = extractvalue {i16, i1} %c25, 0
+ %r26 = extractvalue {i16, i1} %c26, 0
+ %r27 = extractvalue {i16, i1} %c27, 0
+ %r28 = extractvalue {i16, i1} %c28, 0
+ %r29 = extractvalue {i16, i1} %c29, 0
+ %r30 = extractvalue {i16, i1} %c30, 0
+ %r31 = extractvalue {i16, i1} %c31, 0
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
+define void @add_v64i8() {
+; CHECK-LABEL: @add_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[R1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %c0 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a0 , i8 %b0 )
+ %c1 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a1 , i8 %b1 )
+ %c2 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a2 , i8 %b2 )
+ %c3 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a3 , i8 %b3 )
+ %c4 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a4 , i8 %b4 )
+ %c5 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a5 , i8 %b5 )
+ %c6 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a6 , i8 %b6 )
+ %c7 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a7 , i8 %b7 )
+ %c8 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a8 , i8 %b8 )
+ %c9 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a9 , i8 %b9 )
+ %c10 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a10, i8 %b10)
+ %c11 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a11, i8 %b11)
+ %c12 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a12, i8 %b12)
+ %c13 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a13, i8 %b13)
+ %c14 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a14, i8 %b14)
+ %c15 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a15, i8 %b15)
+ %c16 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a16, i8 %b16)
+ %c17 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a17, i8 %b17)
+ %c18 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a18, i8 %b18)
+ %c19 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a19, i8 %b19)
+ %c20 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a20, i8 %b20)
+ %c21 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a21, i8 %b21)
+ %c22 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a22, i8 %b22)
+ %c23 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a23, i8 %b23)
+ %c24 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a24, i8 %b24)
+ %c25 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a25, i8 %b25)
+ %c26 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a26, i8 %b26)
+ %c27 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a27, i8 %b27)
+ %c28 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a28, i8 %b28)
+ %c29 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a29, i8 %b29)
+ %c30 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a30, i8 %b30)
+ %c31 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a31, i8 %b31)
+ %c32 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a32, i8 %b32)
+ %c33 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a33, i8 %b33)
+ %c34 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a34, i8 %b34)
+ %c35 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a35, i8 %b35)
+ %c36 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a36, i8 %b36)
+ %c37 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a37, i8 %b37)
+ %c38 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a38, i8 %b38)
+ %c39 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a39, i8 %b39)
+ %c40 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a40, i8 %b40)
+ %c41 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a41, i8 %b41)
+ %c42 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a42, i8 %b42)
+ %c43 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a43, i8 %b43)
+ %c44 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a44, i8 %b44)
+ %c45 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a45, i8 %b45)
+ %c46 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a46, i8 %b46)
+ %c47 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a47, i8 %b47)
+ %c48 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a48, i8 %b48)
+ %c49 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a49, i8 %b49)
+ %c50 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a50, i8 %b50)
+ %c51 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a51, i8 %b51)
+ %c52 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a52, i8 %b52)
+ %c53 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a53, i8 %b53)
+ %c54 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a54, i8 %b54)
+ %c55 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a55, i8 %b55)
+ %c56 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a56, i8 %b56)
+ %c57 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a57, i8 %b57)
+ %c58 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a58, i8 %b58)
+ %c59 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a59, i8 %b59)
+ %c60 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a60, i8 %b60)
+ %c61 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a61, i8 %b61)
+ %c62 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a62, i8 %b62)
+ %c63 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a63, i8 %b63)
+ %r0 = extractvalue {i8, i1} %c0 , 0
+ %r1 = extractvalue {i8, i1} %c1 , 0
+ %r2 = extractvalue {i8, i1} %c2 , 0
+ %r3 = extractvalue {i8, i1} %c3 , 0
+ %r4 = extractvalue {i8, i1} %c4 , 0
+ %r5 = extractvalue {i8, i1} %c5 , 0
+ %r6 = extractvalue {i8, i1} %c6 , 0
+ %r7 = extractvalue {i8, i1} %c7 , 0
+ %r8 = extractvalue {i8, i1} %c8 , 0
+ %r9 = extractvalue {i8, i1} %c9 , 0
+ %r10 = extractvalue {i8, i1} %c10, 0
+ %r11 = extractvalue {i8, i1} %c11, 0
+ %r12 = extractvalue {i8, i1} %c12, 0
+ %r13 = extractvalue {i8, i1} %c13, 0
+ %r14 = extractvalue {i8, i1} %c14, 0
+ %r15 = extractvalue {i8, i1} %c15, 0
+ %r16 = extractvalue {i8, i1} %c16, 0
+ %r17 = extractvalue {i8, i1} %c17, 0
+ %r18 = extractvalue {i8, i1} %c18, 0
+ %r19 = extractvalue {i8, i1} %c19, 0
+ %r20 = extractvalue {i8, i1} %c20, 0
+ %r21 = extractvalue {i8, i1} %c21, 0
+ %r22 = extractvalue {i8, i1} %c22, 0
+ %r23 = extractvalue {i8, i1} %c23, 0
+ %r24 = extractvalue {i8, i1} %c24, 0
+ %r25 = extractvalue {i8, i1} %c25, 0
+ %r26 = extractvalue {i8, i1} %c26, 0
+ %r27 = extractvalue {i8, i1} %c27, 0
+ %r28 = extractvalue {i8, i1} %c28, 0
+ %r29 = extractvalue {i8, i1} %c29, 0
+ %r30 = extractvalue {i8, i1} %c30, 0
+ %r31 = extractvalue {i8, i1} %c31, 0
+ %r32 = extractvalue {i8, i1} %c32, 0
+ %r33 = extractvalue {i8, i1} %c33, 0
+ %r34 = extractvalue {i8, i1} %c34, 0
+ %r35 = extractvalue {i8, i1} %c35, 0
+ %r36 = extractvalue {i8, i1} %c36, 0
+ %r37 = extractvalue {i8, i1} %c37, 0
+ %r38 = extractvalue {i8, i1} %c38, 0
+ %r39 = extractvalue {i8, i1} %c39, 0
+ %r40 = extractvalue {i8, i1} %c40, 0
+ %r41 = extractvalue {i8, i1} %c41, 0
+ %r42 = extractvalue {i8, i1} %c42, 0
+ %r43 = extractvalue {i8, i1} %c43, 0
+ %r44 = extractvalue {i8, i1} %c44, 0
+ %r45 = extractvalue {i8, i1} %c45, 0
+ %r46 = extractvalue {i8, i1} %c46, 0
+ %r47 = extractvalue {i8, i1} %c47, 0
+ %r48 = extractvalue {i8, i1} %c48, 0
+ %r49 = extractvalue {i8, i1} %c49, 0
+ %r50 = extractvalue {i8, i1} %c50, 0
+ %r51 = extractvalue {i8, i1} %c51, 0
+ %r52 = extractvalue {i8, i1} %c52, 0
+ %r53 = extractvalue {i8, i1} %c53, 0
+ %r54 = extractvalue {i8, i1} %c54, 0
+ %r55 = extractvalue {i8, i1} %c55, 0
+ %r56 = extractvalue {i8, i1} %c56, 0
+ %r57 = extractvalue {i8, i1} %c57, 0
+ %r58 = extractvalue {i8, i1} %c58, 0
+ %r59 = extractvalue {i8, i1} %c59, 0
+ %r60 = extractvalue {i8, i1} %c60, 0
+ %r61 = extractvalue {i8, i1} %c61, 0
+ %r62 = extractvalue {i8, i1} %c62, 0
+ %r63 = extractvalue {i8, i1} %c63, 0
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
new file mode 100644
index 00000000000..0bf3ea20d6c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -0,0 +1,775 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare i64 @llvm.sadd.sat.i64(i64, i64)
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare i16 @llvm.sadd.sat.i16(i16, i16)
+declare i8 @llvm.sadd.sat.i8 (i8 , i8 )
+
+define void @add_v8i64() {
+; SSE-LABEL: @add_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v8i64(
+; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A0]], i64 [[B0]])
+; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A1]], i64 [[B1]])
+; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A2]], i64 [[B2]])
+; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A3]], i64 [[B3]])
+; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A4]], i64 [[B4]])
+; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A5]], i64 [[B5]])
+; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A6]], i64 [[B6]])
+; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.sadd.sat.i64(i64 [[A7]], i64 [[B7]])
+; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SLM-NEXT: ret void
+;
+; AVX1-LABEL: @add_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
+; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
+; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
+; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
+; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @add_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @add_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; AVX256BW-LABEL: @add_v8i64(
+; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = call i64 @llvm.sadd.sat.i64(i64 %a0, i64 %b0)
+ %r1 = call i64 @llvm.sadd.sat.i64(i64 %a1, i64 %b1)
+ %r2 = call i64 @llvm.sadd.sat.i64(i64 %a2, i64 %b2)
+ %r3 = call i64 @llvm.sadd.sat.i64(i64 %a3, i64 %b3)
+ %r4 = call i64 @llvm.sadd.sat.i64(i64 %a4, i64 %b4)
+ %r5 = call i64 @llvm.sadd.sat.i64(i64 %a5, i64 %b5)
+ %r6 = call i64 @llvm.sadd.sat.i64(i64 %a6, i64 %b6)
+ %r7 = call i64 @llvm.sadd.sat.i64(i64 %a7, i64 %b7)
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @add_v16i32() {
+; SSE-LABEL: @add_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v16i32(
+; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @add_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = call i32 @llvm.sadd.sat.i32(i32 %a0 , i32 %b0 )
+ %r1 = call i32 @llvm.sadd.sat.i32(i32 %a1 , i32 %b1 )
+ %r2 = call i32 @llvm.sadd.sat.i32(i32 %a2 , i32 %b2 )
+ %r3 = call i32 @llvm.sadd.sat.i32(i32 %a3 , i32 %b3 )
+ %r4 = call i32 @llvm.sadd.sat.i32(i32 %a4 , i32 %b4 )
+ %r5 = call i32 @llvm.sadd.sat.i32(i32 %a5 , i32 %b5 )
+ %r6 = call i32 @llvm.sadd.sat.i32(i32 %a6 , i32 %b6 )
+ %r7 = call i32 @llvm.sadd.sat.i32(i32 %a7 , i32 %b7 )
+ %r8 = call i32 @llvm.sadd.sat.i32(i32 %a8 , i32 %b8 )
+ %r9 = call i32 @llvm.sadd.sat.i32(i32 %a9 , i32 %b9 )
+ %r10 = call i32 @llvm.sadd.sat.i32(i32 %a10, i32 %b10)
+ %r11 = call i32 @llvm.sadd.sat.i32(i32 %a11, i32 %b11)
+ %r12 = call i32 @llvm.sadd.sat.i32(i32 %a12, i32 %b12)
+ %r13 = call i32 @llvm.sadd.sat.i32(i32 %a13, i32 %b13)
+ %r14 = call i32 @llvm.sadd.sat.i32(i32 %a14, i32 %b14)
+ %r15 = call i32 @llvm.sadd.sat.i32(i32 %a15, i32 %b15)
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @add_v32i16() {
+; SSE-LABEL: @add_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @add_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = call i16 @llvm.sadd.sat.i16(i16 %a0 , i16 %b0 )
+ %r1 = call i16 @llvm.sadd.sat.i16(i16 %a1 , i16 %b1 )
+ %r2 = call i16 @llvm.sadd.sat.i16(i16 %a2 , i16 %b2 )
+ %r3 = call i16 @llvm.sadd.sat.i16(i16 %a3 , i16 %b3 )
+ %r4 = call i16 @llvm.sadd.sat.i16(i16 %a4 , i16 %b4 )
+ %r5 = call i16 @llvm.sadd.sat.i16(i16 %a5 , i16 %b5 )
+ %r6 = call i16 @llvm.sadd.sat.i16(i16 %a6 , i16 %b6 )
+ %r7 = call i16 @llvm.sadd.sat.i16(i16 %a7 , i16 %b7 )
+ %r8 = call i16 @llvm.sadd.sat.i16(i16 %a8 , i16 %b8 )
+ %r9 = call i16 @llvm.sadd.sat.i16(i16 %a9 , i16 %b9 )
+ %r10 = call i16 @llvm.sadd.sat.i16(i16 %a10, i16 %b10)
+ %r11 = call i16 @llvm.sadd.sat.i16(i16 %a11, i16 %b11)
+ %r12 = call i16 @llvm.sadd.sat.i16(i16 %a12, i16 %b12)
+ %r13 = call i16 @llvm.sadd.sat.i16(i16 %a13, i16 %b13)
+ %r14 = call i16 @llvm.sadd.sat.i16(i16 %a14, i16 %b14)
+ %r15 = call i16 @llvm.sadd.sat.i16(i16 %a15, i16 %b15)
+ %r16 = call i16 @llvm.sadd.sat.i16(i16 %a16, i16 %b16)
+ %r17 = call i16 @llvm.sadd.sat.i16(i16 %a17, i16 %b17)
+ %r18 = call i16 @llvm.sadd.sat.i16(i16 %a18, i16 %b18)
+ %r19 = call i16 @llvm.sadd.sat.i16(i16 %a19, i16 %b19)
+ %r20 = call i16 @llvm.sadd.sat.i16(i16 %a20, i16 %b20)
+ %r21 = call i16 @llvm.sadd.sat.i16(i16 %a21, i16 %b21)
+ %r22 = call i16 @llvm.sadd.sat.i16(i16 %a22, i16 %b22)
+ %r23 = call i16 @llvm.sadd.sat.i16(i16 %a23, i16 %b23)
+ %r24 = call i16 @llvm.sadd.sat.i16(i16 %a24, i16 %b24)
+ %r25 = call i16 @llvm.sadd.sat.i16(i16 %a25, i16 %b25)
+ %r26 = call i16 @llvm.sadd.sat.i16(i16 %a26, i16 %b26)
+ %r27 = call i16 @llvm.sadd.sat.i16(i16 %a27, i16 %b27)
+ %r28 = call i16 @llvm.sadd.sat.i16(i16 %a28, i16 %b28)
+ %r29 = call i16 @llvm.sadd.sat.i16(i16 %a29, i16 %b29)
+ %r30 = call i16 @llvm.sadd.sat.i16(i16 %a30, i16 %b30)
+ %r31 = call i16 @llvm.sadd.sat.i16(i16 %a31, i16 %b31)
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
+define void @add_v64i8() {
+; CHECK-LABEL: @add_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = call i8 @llvm.sadd.sat.i8(i8 %a0 , i8 %b0 )
+ %r1 = call i8 @llvm.sadd.sat.i8(i8 %a1 , i8 %b1 )
+ %r2 = call i8 @llvm.sadd.sat.i8(i8 %a2 , i8 %b2 )
+ %r3 = call i8 @llvm.sadd.sat.i8(i8 %a3 , i8 %b3 )
+ %r4 = call i8 @llvm.sadd.sat.i8(i8 %a4 , i8 %b4 )
+ %r5 = call i8 @llvm.sadd.sat.i8(i8 %a5 , i8 %b5 )
+ %r6 = call i8 @llvm.sadd.sat.i8(i8 %a6 , i8 %b6 )
+ %r7 = call i8 @llvm.sadd.sat.i8(i8 %a7 , i8 %b7 )
+ %r8 = call i8 @llvm.sadd.sat.i8(i8 %a8 , i8 %b8 )
+ %r9 = call i8 @llvm.sadd.sat.i8(i8 %a9 , i8 %b9 )
+ %r10 = call i8 @llvm.sadd.sat.i8(i8 %a10, i8 %b10)
+ %r11 = call i8 @llvm.sadd.sat.i8(i8 %a11, i8 %b11)
+ %r12 = call i8 @llvm.sadd.sat.i8(i8 %a12, i8 %b12)
+ %r13 = call i8 @llvm.sadd.sat.i8(i8 %a13, i8 %b13)
+ %r14 = call i8 @llvm.sadd.sat.i8(i8 %a14, i8 %b14)
+ %r15 = call i8 @llvm.sadd.sat.i8(i8 %a15, i8 %b15)
+ %r16 = call i8 @llvm.sadd.sat.i8(i8 %a16, i8 %b16)
+ %r17 = call i8 @llvm.sadd.sat.i8(i8 %a17, i8 %b17)
+ %r18 = call i8 @llvm.sadd.sat.i8(i8 %a18, i8 %b18)
+ %r19 = call i8 @llvm.sadd.sat.i8(i8 %a19, i8 %b19)
+ %r20 = call i8 @llvm.sadd.sat.i8(i8 %a20, i8 %b20)
+ %r21 = call i8 @llvm.sadd.sat.i8(i8 %a21, i8 %b21)
+ %r22 = call i8 @llvm.sadd.sat.i8(i8 %a22, i8 %b22)
+ %r23 = call i8 @llvm.sadd.sat.i8(i8 %a23, i8 %b23)
+ %r24 = call i8 @llvm.sadd.sat.i8(i8 %a24, i8 %b24)
+ %r25 = call i8 @llvm.sadd.sat.i8(i8 %a25, i8 %b25)
+ %r26 = call i8 @llvm.sadd.sat.i8(i8 %a26, i8 %b26)
+ %r27 = call i8 @llvm.sadd.sat.i8(i8 %a27, i8 %b27)
+ %r28 = call i8 @llvm.sadd.sat.i8(i8 %a28, i8 %b28)
+ %r29 = call i8 @llvm.sadd.sat.i8(i8 %a29, i8 %b29)
+ %r30 = call i8 @llvm.sadd.sat.i8(i8 %a30, i8 %b30)
+ %r31 = call i8 @llvm.sadd.sat.i8(i8 %a31, i8 %b31)
+ %r32 = call i8 @llvm.sadd.sat.i8(i8 %a32, i8 %b32)
+ %r33 = call i8 @llvm.sadd.sat.i8(i8 %a33, i8 %b33)
+ %r34 = call i8 @llvm.sadd.sat.i8(i8 %a34, i8 %b34)
+ %r35 = call i8 @llvm.sadd.sat.i8(i8 %a35, i8 %b35)
+ %r36 = call i8 @llvm.sadd.sat.i8(i8 %a36, i8 %b36)
+ %r37 = call i8 @llvm.sadd.sat.i8(i8 %a37, i8 %b37)
+ %r38 = call i8 @llvm.sadd.sat.i8(i8 %a38, i8 %b38)
+ %r39 = call i8 @llvm.sadd.sat.i8(i8 %a39, i8 %b39)
+ %r40 = call i8 @llvm.sadd.sat.i8(i8 %a40, i8 %b40)
+ %r41 = call i8 @llvm.sadd.sat.i8(i8 %a41, i8 %b41)
+ %r42 = call i8 @llvm.sadd.sat.i8(i8 %a42, i8 %b42)
+ %r43 = call i8 @llvm.sadd.sat.i8(i8 %a43, i8 %b43)
+ %r44 = call i8 @llvm.sadd.sat.i8(i8 %a44, i8 %b44)
+ %r45 = call i8 @llvm.sadd.sat.i8(i8 %a45, i8 %b45)
+ %r46 = call i8 @llvm.sadd.sat.i8(i8 %a46, i8 %b46)
+ %r47 = call i8 @llvm.sadd.sat.i8(i8 %a47, i8 %b47)
+ %r48 = call i8 @llvm.sadd.sat.i8(i8 %a48, i8 %b48)
+ %r49 = call i8 @llvm.sadd.sat.i8(i8 %a49, i8 %b49)
+ %r50 = call i8 @llvm.sadd.sat.i8(i8 %a50, i8 %b50)
+ %r51 = call i8 @llvm.sadd.sat.i8(i8 %a51, i8 %b51)
+ %r52 = call i8 @llvm.sadd.sat.i8(i8 %a52, i8 %b52)
+ %r53 = call i8 @llvm.sadd.sat.i8(i8 %a53, i8 %b53)
+ %r54 = call i8 @llvm.sadd.sat.i8(i8 %a54, i8 %b54)
+ %r55 = call i8 @llvm.sadd.sat.i8(i8 %a55, i8 %b55)
+ %r56 = call i8 @llvm.sadd.sat.i8(i8 %a56, i8 %b56)
+ %r57 = call i8 @llvm.sadd.sat.i8(i8 %a57, i8 %b57)
+ %r58 = call i8 @llvm.sadd.sat.i8(i8 %a58, i8 %b58)
+ %r59 = call i8 @llvm.sadd.sat.i8(i8 %a59, i8 %b59)
+ %r60 = call i8 @llvm.sadd.sat.i8(i8 %a60, i8 %b60)
+ %r61 = call i8 @llvm.sadd.sat.i8(i8 %a61, i8 %b61)
+ %r62 = call i8 @llvm.sadd.sat.i8(i8 %a62, i8 %b62)
+ %r63 = call i8 @llvm.sadd.sat.i8(i8 %a63, i8 %b63)
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll
new file mode 100644
index 00000000000..b057121a275
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-uaddo.ll
@@ -0,0 +1,1254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
+declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16)
+declare {i8 , i1} @llvm.uadd.with.overflow.i8 (i8 , i8 )
+
+define void @add_v8i64() {
+; CHECK-LABEL: @add_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %c0 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a0, i64 %b0)
+ %c1 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a1, i64 %b1)
+ %c2 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a2, i64 %b2)
+ %c3 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a3, i64 %b3)
+ %c4 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a4, i64 %b4)
+ %c5 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a5, i64 %b5)
+ %c6 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a6, i64 %b6)
+ %c7 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a7, i64 %b7)
+ %r0 = extractvalue {i64, i1} %c0, 0
+ %r1 = extractvalue {i64, i1} %c1, 0
+ %r2 = extractvalue {i64, i1} %c2, 0
+ %r3 = extractvalue {i64, i1} %c3, 0
+ %r4 = extractvalue {i64, i1} %c4, 0
+ %r5 = extractvalue {i64, i1} %c5, 0
+ %r6 = extractvalue {i64, i1} %c6, 0
+ %r7 = extractvalue {i64, i1} %c7, 0
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @add_v16i32() {
+; CHECK-LABEL: @add_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %c0 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a0 , i32 %b0 )
+ %c1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a1 , i32 %b1 )
+ %c2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a2 , i32 %b2 )
+ %c3 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a3 , i32 %b3 )
+ %c4 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a4 , i32 %b4 )
+ %c5 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a5 , i32 %b5 )
+ %c6 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a6 , i32 %b6 )
+ %c7 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a7 , i32 %b7 )
+ %c8 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a8 , i32 %b8 )
+ %c9 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a9 , i32 %b9 )
+ %c10 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a10, i32 %b10)
+ %c11 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a11, i32 %b11)
+ %c12 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a12, i32 %b12)
+ %c13 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a13, i32 %b13)
+ %c14 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a14, i32 %b14)
+ %c15 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a15, i32 %b15)
+ %r0 = extractvalue {i32, i1} %c0 , 0
+ %r1 = extractvalue {i32, i1} %c1 , 0
+ %r2 = extractvalue {i32, i1} %c2 , 0
+ %r3 = extractvalue {i32, i1} %c3 , 0
+ %r4 = extractvalue {i32, i1} %c4 , 0
+ %r5 = extractvalue {i32, i1} %c5 , 0
+ %r6 = extractvalue {i32, i1} %c6 , 0
+ %r7 = extractvalue {i32, i1} %c7 , 0
+ %r8 = extractvalue {i32, i1} %c8 , 0
+ %r9 = extractvalue {i32, i1} %c9 , 0
+ %r10 = extractvalue {i32, i1} %c10, 0
+ %r11 = extractvalue {i32, i1} %c11, 0
+ %r12 = extractvalue {i32, i1} %c12, 0
+ %r13 = extractvalue {i32, i1} %c13, 0
+ %r14 = extractvalue {i32, i1} %c14, 0
+ %r15 = extractvalue {i32, i1} %c15, 0
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
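+; Same pattern at half the element width: 32 x i16 lanes added pairwise via
+; llvm.uadd.with.overflow.i16. As with the i32 case above, the assertions
+; mirror the fully scalar IR, i.e. the SLP vectorizer is expected to leave
+; these overflow intrinsics unvectorized.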
+define void @add_v32i16() {
+; CHECK-LABEL: @add_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; CHECK-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %c0 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a0 , i16 %b0 )
+ %c1 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a1 , i16 %b1 )
+ %c2 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a2 , i16 %b2 )
+ %c3 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a3 , i16 %b3 )
+ %c4 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a4 , i16 %b4 )
+ %c5 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a5 , i16 %b5 )
+ %c6 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a6 , i16 %b6 )
+ %c7 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a7 , i16 %b7 )
+ %c8 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a8 , i16 %b8 )
+ %c9 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a9 , i16 %b9 )
+ %c10 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a10, i16 %b10)
+ %c11 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a11, i16 %b11)
+ %c12 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a12, i16 %b12)
+ %c13 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a13, i16 %b13)
+ %c14 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a14, i16 %b14)
+ %c15 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a15, i16 %b15)
+ %c16 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a16, i16 %b16)
+ %c17 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a17, i16 %b17)
+ %c18 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a18, i16 %b18)
+ %c19 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a19, i16 %b19)
+ %c20 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a20, i16 %b20)
+ %c21 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a21, i16 %b21)
+ %c22 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a22, i16 %b22)
+ %c23 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a23, i16 %b23)
+ %c24 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a24, i16 %b24)
+ %c25 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a25, i16 %b25)
+ %c26 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a26, i16 %b26)
+ %c27 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a27, i16 %b27)
+ %c28 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a28, i16 %b28)
+ %c29 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a29, i16 %b29)
+ %c30 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a30, i16 %b30)
+ %c31 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %a31, i16 %b31)
+ %r0 = extractvalue {i16, i1} %c0 , 0
+ %r1 = extractvalue {i16, i1} %c1 , 0
+ %r2 = extractvalue {i16, i1} %c2 , 0
+ %r3 = extractvalue {i16, i1} %c3 , 0
+ %r4 = extractvalue {i16, i1} %c4 , 0
+ %r5 = extractvalue {i16, i1} %c5 , 0
+ %r6 = extractvalue {i16, i1} %c6 , 0
+ %r7 = extractvalue {i16, i1} %c7 , 0
+ %r8 = extractvalue {i16, i1} %c8 , 0
+ %r9 = extractvalue {i16, i1} %c9 , 0
+ %r10 = extractvalue {i16, i1} %c10, 0
+ %r11 = extractvalue {i16, i1} %c11, 0
+ %r12 = extractvalue {i16, i1} %c12, 0
+ %r13 = extractvalue {i16, i1} %c13, 0
+ %r14 = extractvalue {i16, i1} %c14, 0
+ %r15 = extractvalue {i16, i1} %c15, 0
+ %r16 = extractvalue {i16, i1} %c16, 0
+ %r17 = extractvalue {i16, i1} %c17, 0
+ %r18 = extractvalue {i16, i1} %c18, 0
+ %r19 = extractvalue {i16, i1} %c19, 0
+ %r20 = extractvalue {i16, i1} %c20, 0
+ %r21 = extractvalue {i16, i1} %c21, 0
+ %r22 = extractvalue {i16, i1} %c22, 0
+ %r23 = extractvalue {i16, i1} %c23, 0
+ %r24 = extractvalue {i16, i1} %c24, 0
+ %r25 = extractvalue {i16, i1} %c25, 0
+ %r26 = extractvalue {i16, i1} %c26, 0
+ %r27 = extractvalue {i16, i1} %c27, 0
+ %r28 = extractvalue {i16, i1} %c28, 0
+ %r29 = extractvalue {i16, i1} %c29, 0
+ %r30 = extractvalue {i16, i1} %c30, 0
+ %r31 = extractvalue {i16, i1} %c31, 0
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
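+; And again for 64 x i8 lanes via llvm.uadd.with.overflow.i8; the assertions
+; below likewise match the scalar load/call/extract/store sequence lane by
+; lane, with no vector operations expected.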
+define void @add_v64i8() {
+; CHECK-LABEL: @add_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[R1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %c0 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a0 , i8 %b0 )
+ %c1 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a1 , i8 %b1 )
+ %c2 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a2 , i8 %b2 )
+ %c3 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a3 , i8 %b3 )
+ %c4 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a4 , i8 %b4 )
+ %c5 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a5 , i8 %b5 )
+ %c6 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a6 , i8 %b6 )
+ %c7 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a7 , i8 %b7 )
+ %c8 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a8 , i8 %b8 )
+ %c9 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a9 , i8 %b9 )
+ %c10 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a10, i8 %b10)
+ %c11 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a11, i8 %b11)
+ %c12 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a12, i8 %b12)
+ %c13 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a13, i8 %b13)
+ %c14 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a14, i8 %b14)
+ %c15 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a15, i8 %b15)
+ %c16 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a16, i8 %b16)
+ %c17 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a17, i8 %b17)
+ %c18 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a18, i8 %b18)
+ %c19 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a19, i8 %b19)
+ %c20 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a20, i8 %b20)
+ %c21 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a21, i8 %b21)
+ %c22 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a22, i8 %b22)
+ %c23 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a23, i8 %b23)
+ %c24 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a24, i8 %b24)
+ %c25 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a25, i8 %b25)
+ %c26 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a26, i8 %b26)
+ %c27 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a27, i8 %b27)
+ %c28 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a28, i8 %b28)
+ %c29 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a29, i8 %b29)
+ %c30 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a30, i8 %b30)
+ %c31 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a31, i8 %b31)
+ %c32 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a32, i8 %b32)
+ %c33 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a33, i8 %b33)
+ %c34 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a34, i8 %b34)
+ %c35 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a35, i8 %b35)
+ %c36 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a36, i8 %b36)
+ %c37 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a37, i8 %b37)
+ %c38 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a38, i8 %b38)
+ %c39 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a39, i8 %b39)
+ %c40 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a40, i8 %b40)
+ %c41 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a41, i8 %b41)
+ %c42 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a42, i8 %b42)
+ %c43 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a43, i8 %b43)
+ %c44 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a44, i8 %b44)
+ %c45 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a45, i8 %b45)
+ %c46 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a46, i8 %b46)
+ %c47 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a47, i8 %b47)
+ %c48 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a48, i8 %b48)
+ %c49 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a49, i8 %b49)
+ %c50 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a50, i8 %b50)
+ %c51 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a51, i8 %b51)
+ %c52 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a52, i8 %b52)
+ %c53 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a53, i8 %b53)
+ %c54 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a54, i8 %b54)
+ %c55 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a55, i8 %b55)
+ %c56 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a56, i8 %b56)
+ %c57 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a57, i8 %b57)
+ %c58 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a58, i8 %b58)
+ %c59 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a59, i8 %b59)
+ %c60 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a60, i8 %b60)
+ %c61 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a61, i8 %b61)
+ %c62 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a62, i8 %b62)
+ %c63 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a63, i8 %b63)
+ %r0 = extractvalue {i8, i1} %c0 , 0
+ %r1 = extractvalue {i8, i1} %c1 , 0
+ %r2 = extractvalue {i8, i1} %c2 , 0
+ %r3 = extractvalue {i8, i1} %c3 , 0
+ %r4 = extractvalue {i8, i1} %c4 , 0
+ %r5 = extractvalue {i8, i1} %c5 , 0
+ %r6 = extractvalue {i8, i1} %c6 , 0
+ %r7 = extractvalue {i8, i1} %c7 , 0
+ %r8 = extractvalue {i8, i1} %c8 , 0
+ %r9 = extractvalue {i8, i1} %c9 , 0
+ %r10 = extractvalue {i8, i1} %c10, 0
+ %r11 = extractvalue {i8, i1} %c11, 0
+ %r12 = extractvalue {i8, i1} %c12, 0
+ %r13 = extractvalue {i8, i1} %c13, 0
+ %r14 = extractvalue {i8, i1} %c14, 0
+ %r15 = extractvalue {i8, i1} %c15, 0
+ %r16 = extractvalue {i8, i1} %c16, 0
+ %r17 = extractvalue {i8, i1} %c17, 0
+ %r18 = extractvalue {i8, i1} %c18, 0
+ %r19 = extractvalue {i8, i1} %c19, 0
+ %r20 = extractvalue {i8, i1} %c20, 0
+ %r21 = extractvalue {i8, i1} %c21, 0
+ %r22 = extractvalue {i8, i1} %c22, 0
+ %r23 = extractvalue {i8, i1} %c23, 0
+ %r24 = extractvalue {i8, i1} %c24, 0
+ %r25 = extractvalue {i8, i1} %c25, 0
+ %r26 = extractvalue {i8, i1} %c26, 0
+ %r27 = extractvalue {i8, i1} %c27, 0
+ %r28 = extractvalue {i8, i1} %c28, 0
+ %r29 = extractvalue {i8, i1} %c29, 0
+ %r30 = extractvalue {i8, i1} %c30, 0
+ %r31 = extractvalue {i8, i1} %c31, 0
+ %r32 = extractvalue {i8, i1} %c32, 0
+ %r33 = extractvalue {i8, i1} %c33, 0
+ %r34 = extractvalue {i8, i1} %c34, 0
+ %r35 = extractvalue {i8, i1} %c35, 0
+ %r36 = extractvalue {i8, i1} %c36, 0
+ %r37 = extractvalue {i8, i1} %c37, 0
+ %r38 = extractvalue {i8, i1} %c38, 0
+ %r39 = extractvalue {i8, i1} %c39, 0
+ %r40 = extractvalue {i8, i1} %c40, 0
+ %r41 = extractvalue {i8, i1} %c41, 0
+ %r42 = extractvalue {i8, i1} %c42, 0
+ %r43 = extractvalue {i8, i1} %c43, 0
+ %r44 = extractvalue {i8, i1} %c44, 0
+ %r45 = extractvalue {i8, i1} %c45, 0
+ %r46 = extractvalue {i8, i1} %c46, 0
+ %r47 = extractvalue {i8, i1} %c47, 0
+ %r48 = extractvalue {i8, i1} %c48, 0
+ %r49 = extractvalue {i8, i1} %c49, 0
+ %r50 = extractvalue {i8, i1} %c50, 0
+ %r51 = extractvalue {i8, i1} %c51, 0
+ %r52 = extractvalue {i8, i1} %c52, 0
+ %r53 = extractvalue {i8, i1} %c53, 0
+ %r54 = extractvalue {i8, i1} %c54, 0
+ %r55 = extractvalue {i8, i1} %c55, 0
+ %r56 = extractvalue {i8, i1} %c56, 0
+ %r57 = extractvalue {i8, i1} %c57, 0
+ %r58 = extractvalue {i8, i1} %c58, 0
+ %r59 = extractvalue {i8, i1} %c59, 0
+ %r60 = extractvalue {i8, i1} %c60, 0
+ %r61 = extractvalue {i8, i1} %c61, 0
+ %r62 = extractvalue {i8, i1} %c62, 0
+ %r63 = extractvalue {i8, i1} %c63, 0
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
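+; Note: the scalar CHECK lines above are the expected output here; under
+; every RUN configuration (hence the shared CHECK prefix) the SLP vectorizer
+; leaves @llvm.uadd.with.overflow.i8 as 64 individual scalar calls plus
+; extractvalues. Contrast this with the saturating-add tests in
+; arith-add-usat.ll below, where the @llvm.uadd.sat.* intrinsics are
+; rewritten to their vector overloads.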
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
new file mode 100644
index 00000000000..4bd84d846e1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@@ -0,0 +1,729 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare i64 @llvm.uadd.sat.i64(i64, i64)
+declare i32 @llvm.uadd.sat.i32(i32, i32)
+declare i16 @llvm.uadd.sat.i16(i16, i16)
+declare i8 @llvm.uadd.sat.i8 (i8 , i8 )
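+; Only the scalar forms need declaring in the input; the vector overloads
+; (e.g. @llvm.uadd.sat.v2i64, @llvm.uadd.sat.v16i32) that appear in the
+; CHECK lines are introduced by the SLP vectorizer itself.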
+
+define void @add_v8i64() {
+; SSE-LABEL: @add_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v8i64(
+; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
+; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @add_v8i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = call i64 @llvm.uadd.sat.i64(i64 %a0, i64 %b0)
+ %r1 = call i64 @llvm.uadd.sat.i64(i64 %a1, i64 %b1)
+ %r2 = call i64 @llvm.uadd.sat.i64(i64 %a2, i64 %b2)
+ %r3 = call i64 @llvm.uadd.sat.i64(i64 %a3, i64 %b3)
+ %r4 = call i64 @llvm.uadd.sat.i64(i64 %a4, i64 %b4)
+ %r5 = call i64 @llvm.uadd.sat.i64(i64 %a5, i64 %b5)
+ %r6 = call i64 @llvm.uadd.sat.i64(i64 %a6, i64 %b6)
+ %r7 = call i64 @llvm.uadd.sat.i64(i64 %a7, i64 %b7)
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
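+; For the i64 element case the chosen width tracks the subtarget: plain SSE
+; stays scalar (its CHECK lines keep eight @llvm.uadd.sat.i64 calls), SLM
+; uses <2 x i64>, AVX <4 x i64>, and AVX512 a single <8 x i64> operation,
+; presumably per each target's cost model.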
+
+define void @add_v16i32() {
+; SSE-LABEL: @add_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v16i32(
+; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @add_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = call i32 @llvm.uadd.sat.i32(i32 %a0 , i32 %b0 )
+ %r1 = call i32 @llvm.uadd.sat.i32(i32 %a1 , i32 %b1 )
+ %r2 = call i32 @llvm.uadd.sat.i32(i32 %a2 , i32 %b2 )
+ %r3 = call i32 @llvm.uadd.sat.i32(i32 %a3 , i32 %b3 )
+ %r4 = call i32 @llvm.uadd.sat.i32(i32 %a4 , i32 %b4 )
+ %r5 = call i32 @llvm.uadd.sat.i32(i32 %a5 , i32 %b5 )
+ %r6 = call i32 @llvm.uadd.sat.i32(i32 %a6 , i32 %b6 )
+ %r7 = call i32 @llvm.uadd.sat.i32(i32 %a7 , i32 %b7 )
+ %r8 = call i32 @llvm.uadd.sat.i32(i32 %a8 , i32 %b8 )
+ %r9 = call i32 @llvm.uadd.sat.i32(i32 %a9 , i32 %b9 )
+ %r10 = call i32 @llvm.uadd.sat.i32(i32 %a10, i32 %b10)
+ %r11 = call i32 @llvm.uadd.sat.i32(i32 %a11, i32 %b11)
+ %r12 = call i32 @llvm.uadd.sat.i32(i32 %a12, i32 %b12)
+ %r13 = call i32 @llvm.uadd.sat.i32(i32 %a13, i32 %b13)
+ %r14 = call i32 @llvm.uadd.sat.i32(i32 %a14, i32 %b14)
+ %r15 = call i32 @llvm.uadd.sat.i32(i32 %a15, i32 %b15)
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
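+; Same pattern for i16: per the checks, SSE/SLM split the 32 saturating adds
+; into four <8 x i16> intrinsics, while both AVX and AVX512 settle on two
+; <16 x i16> intrinsics.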
+define void @add_v32i16() {
+; SSE-LABEL: @add_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @add_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = call i16 @llvm.uadd.sat.i16(i16 %a0 , i16 %b0 )
+ %r1 = call i16 @llvm.uadd.sat.i16(i16 %a1 , i16 %b1 )
+ %r2 = call i16 @llvm.uadd.sat.i16(i16 %a2 , i16 %b2 )
+ %r3 = call i16 @llvm.uadd.sat.i16(i16 %a3 , i16 %b3 )
+ %r4 = call i16 @llvm.uadd.sat.i16(i16 %a4 , i16 %b4 )
+ %r5 = call i16 @llvm.uadd.sat.i16(i16 %a5 , i16 %b5 )
+ %r6 = call i16 @llvm.uadd.sat.i16(i16 %a6 , i16 %b6 )
+ %r7 = call i16 @llvm.uadd.sat.i16(i16 %a7 , i16 %b7 )
+ %r8 = call i16 @llvm.uadd.sat.i16(i16 %a8 , i16 %b8 )
+ %r9 = call i16 @llvm.uadd.sat.i16(i16 %a9 , i16 %b9 )
+ %r10 = call i16 @llvm.uadd.sat.i16(i16 %a10, i16 %b10)
+ %r11 = call i16 @llvm.uadd.sat.i16(i16 %a11, i16 %b11)
+ %r12 = call i16 @llvm.uadd.sat.i16(i16 %a12, i16 %b12)
+ %r13 = call i16 @llvm.uadd.sat.i16(i16 %a13, i16 %b13)
+ %r14 = call i16 @llvm.uadd.sat.i16(i16 %a14, i16 %b14)
+ %r15 = call i16 @llvm.uadd.sat.i16(i16 %a15, i16 %b15)
+ %r16 = call i16 @llvm.uadd.sat.i16(i16 %a16, i16 %b16)
+ %r17 = call i16 @llvm.uadd.sat.i16(i16 %a17, i16 %b17)
+ %r18 = call i16 @llvm.uadd.sat.i16(i16 %a18, i16 %b18)
+ %r19 = call i16 @llvm.uadd.sat.i16(i16 %a19, i16 %b19)
+ %r20 = call i16 @llvm.uadd.sat.i16(i16 %a20, i16 %b20)
+ %r21 = call i16 @llvm.uadd.sat.i16(i16 %a21, i16 %b21)
+ %r22 = call i16 @llvm.uadd.sat.i16(i16 %a22, i16 %b22)
+ %r23 = call i16 @llvm.uadd.sat.i16(i16 %a23, i16 %b23)
+ %r24 = call i16 @llvm.uadd.sat.i16(i16 %a24, i16 %b24)
+ %r25 = call i16 @llvm.uadd.sat.i16(i16 %a25, i16 %b25)
+ %r26 = call i16 @llvm.uadd.sat.i16(i16 %a26, i16 %b26)
+ %r27 = call i16 @llvm.uadd.sat.i16(i16 %a27, i16 %b27)
+ %r28 = call i16 @llvm.uadd.sat.i16(i16 %a28, i16 %b28)
+ %r29 = call i16 @llvm.uadd.sat.i16(i16 %a29, i16 %b29)
+ %r30 = call i16 @llvm.uadd.sat.i16(i16 %a30, i16 %b30)
+ %r31 = call i16 @llvm.uadd.sat.i16(i16 %a31, i16 %b31)
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
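+; For i8 every run configuration vectorizes to the same four <16 x i8>
+; llvm.uadd.sat.v16i8 calls, so a single shared CHECK prefix suffices.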
+define void @add_v64i8() {
+; CHECK-LABEL: @add_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = call i8 @llvm.uadd.sat.i8(i8 %a0 , i8 %b0 )
+ %r1 = call i8 @llvm.uadd.sat.i8(i8 %a1 , i8 %b1 )
+ %r2 = call i8 @llvm.uadd.sat.i8(i8 %a2 , i8 %b2 )
+ %r3 = call i8 @llvm.uadd.sat.i8(i8 %a3 , i8 %b3 )
+ %r4 = call i8 @llvm.uadd.sat.i8(i8 %a4 , i8 %b4 )
+ %r5 = call i8 @llvm.uadd.sat.i8(i8 %a5 , i8 %b5 )
+ %r6 = call i8 @llvm.uadd.sat.i8(i8 %a6 , i8 %b6 )
+ %r7 = call i8 @llvm.uadd.sat.i8(i8 %a7 , i8 %b7 )
+ %r8 = call i8 @llvm.uadd.sat.i8(i8 %a8 , i8 %b8 )
+ %r9 = call i8 @llvm.uadd.sat.i8(i8 %a9 , i8 %b9 )
+ %r10 = call i8 @llvm.uadd.sat.i8(i8 %a10, i8 %b10)
+ %r11 = call i8 @llvm.uadd.sat.i8(i8 %a11, i8 %b11)
+ %r12 = call i8 @llvm.uadd.sat.i8(i8 %a12, i8 %b12)
+ %r13 = call i8 @llvm.uadd.sat.i8(i8 %a13, i8 %b13)
+ %r14 = call i8 @llvm.uadd.sat.i8(i8 %a14, i8 %b14)
+ %r15 = call i8 @llvm.uadd.sat.i8(i8 %a15, i8 %b15)
+ %r16 = call i8 @llvm.uadd.sat.i8(i8 %a16, i8 %b16)
+ %r17 = call i8 @llvm.uadd.sat.i8(i8 %a17, i8 %b17)
+ %r18 = call i8 @llvm.uadd.sat.i8(i8 %a18, i8 %b18)
+ %r19 = call i8 @llvm.uadd.sat.i8(i8 %a19, i8 %b19)
+ %r20 = call i8 @llvm.uadd.sat.i8(i8 %a20, i8 %b20)
+ %r21 = call i8 @llvm.uadd.sat.i8(i8 %a21, i8 %b21)
+ %r22 = call i8 @llvm.uadd.sat.i8(i8 %a22, i8 %b22)
+ %r23 = call i8 @llvm.uadd.sat.i8(i8 %a23, i8 %b23)
+ %r24 = call i8 @llvm.uadd.sat.i8(i8 %a24, i8 %b24)
+ %r25 = call i8 @llvm.uadd.sat.i8(i8 %a25, i8 %b25)
+ %r26 = call i8 @llvm.uadd.sat.i8(i8 %a26, i8 %b26)
+ %r27 = call i8 @llvm.uadd.sat.i8(i8 %a27, i8 %b27)
+ %r28 = call i8 @llvm.uadd.sat.i8(i8 %a28, i8 %b28)
+ %r29 = call i8 @llvm.uadd.sat.i8(i8 %a29, i8 %b29)
+ %r30 = call i8 @llvm.uadd.sat.i8(i8 %a30, i8 %b30)
+ %r31 = call i8 @llvm.uadd.sat.i8(i8 %a31, i8 %b31)
+ %r32 = call i8 @llvm.uadd.sat.i8(i8 %a32, i8 %b32)
+ %r33 = call i8 @llvm.uadd.sat.i8(i8 %a33, i8 %b33)
+ %r34 = call i8 @llvm.uadd.sat.i8(i8 %a34, i8 %b34)
+ %r35 = call i8 @llvm.uadd.sat.i8(i8 %a35, i8 %b35)
+ %r36 = call i8 @llvm.uadd.sat.i8(i8 %a36, i8 %b36)
+ %r37 = call i8 @llvm.uadd.sat.i8(i8 %a37, i8 %b37)
+ %r38 = call i8 @llvm.uadd.sat.i8(i8 %a38, i8 %b38)
+ %r39 = call i8 @llvm.uadd.sat.i8(i8 %a39, i8 %b39)
+ %r40 = call i8 @llvm.uadd.sat.i8(i8 %a40, i8 %b40)
+ %r41 = call i8 @llvm.uadd.sat.i8(i8 %a41, i8 %b41)
+ %r42 = call i8 @llvm.uadd.sat.i8(i8 %a42, i8 %b42)
+ %r43 = call i8 @llvm.uadd.sat.i8(i8 %a43, i8 %b43)
+ %r44 = call i8 @llvm.uadd.sat.i8(i8 %a44, i8 %b44)
+ %r45 = call i8 @llvm.uadd.sat.i8(i8 %a45, i8 %b45)
+ %r46 = call i8 @llvm.uadd.sat.i8(i8 %a46, i8 %b46)
+ %r47 = call i8 @llvm.uadd.sat.i8(i8 %a47, i8 %b47)
+ %r48 = call i8 @llvm.uadd.sat.i8(i8 %a48, i8 %b48)
+ %r49 = call i8 @llvm.uadd.sat.i8(i8 %a49, i8 %b49)
+ %r50 = call i8 @llvm.uadd.sat.i8(i8 %a50, i8 %b50)
+ %r51 = call i8 @llvm.uadd.sat.i8(i8 %a51, i8 %b51)
+ %r52 = call i8 @llvm.uadd.sat.i8(i8 %a52, i8 %b52)
+ %r53 = call i8 @llvm.uadd.sat.i8(i8 %a53, i8 %b53)
+ %r54 = call i8 @llvm.uadd.sat.i8(i8 %a54, i8 %b54)
+ %r55 = call i8 @llvm.uadd.sat.i8(i8 %a55, i8 %b55)
+ %r56 = call i8 @llvm.uadd.sat.i8(i8 %a56, i8 %b56)
+ %r57 = call i8 @llvm.uadd.sat.i8(i8 %a57, i8 %b57)
+ %r58 = call i8 @llvm.uadd.sat.i8(i8 %a58, i8 %b58)
+ %r59 = call i8 @llvm.uadd.sat.i8(i8 %a59, i8 %b59)
+ %r60 = call i8 @llvm.uadd.sat.i8(i8 %a60, i8 %b60)
+ %r61 = call i8 @llvm.uadd.sat.i8(i8 %a61, i8 %b61)
+ %r62 = call i8 @llvm.uadd.sat.i8(i8 %a62, i8 %b62)
+ %r63 = call i8 @llvm.uadd.sat.i8(i8 %a63, i8 %b63)
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
new file mode 100644
index 00000000000..5c6430c689f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
@@ -0,0 +1,708 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
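+; Note: the final skx run uses +prefer-256-bit to cap the vector width at
+; 256 bits, so its output is expected to match the AVX/AVX2 prefixes rather
+; than AVX512.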
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
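+; Plain i64 adds over @a64/@b64: the checks show <2 x i64> adds for SSE/SLM,
+; <4 x i64> adds for AVX, and a single <8 x i64> add for AVX512.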
+define void @add_v8i64() {
+; SSE-LABEL: @add_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v8i64(
+; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]]
+; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]]
+; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]]
+; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]]
+; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @add_v8i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = add i64 %a0, %b0
+ %r1 = add i64 %a1, %b1
+ %r2 = add i64 %a2, %b2
+ %r3 = add i64 %a3, %b3
+ %r4 = add i64 %a4, %b4
+ %r5 = add i64 %a5, %b5
+ %r6 = add i64 %a6, %b6
+ %r7 = add i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @add_v16i32() {
+; SSE-LABEL: @add_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v16i32(
+; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]]
+; SLM-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]]
+; SLM-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
+; SLM-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @add_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = add i32 %a0 , %b0
+ %r1 = add i32 %a1 , %b1
+ %r2 = add i32 %a2 , %b2
+ %r3 = add i32 %a3 , %b3
+ %r4 = add i32 %a4 , %b4
+ %r5 = add i32 %a5 , %b5
+ %r6 = add i32 %a6 , %b6
+ %r7 = add i32 %a7 , %b7
+ %r8 = add i32 %a8 , %b8
+ %r9 = add i32 %a9 , %b9
+ %r10 = add i32 %a10, %b10
+ %r11 = add i32 %a11, %b11
+ %r12 = add i32 %a12, %b12
+ %r13 = add i32 %a13, %b13
+ %r14 = add i32 %a14, %b14
+ %r15 = add i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @add_v32i16() {
+; SSE-LABEL: @add_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @add_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
+; SLM-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
+; SLM-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
+; SLM-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @add_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @add_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = add i16 %a0 , %b0
+ %r1 = add i16 %a1 , %b1
+ %r2 = add i16 %a2 , %b2
+ %r3 = add i16 %a3 , %b3
+ %r4 = add i16 %a4 , %b4
+ %r5 = add i16 %a5 , %b5
+ %r6 = add i16 %a6 , %b6
+ %r7 = add i16 %a7 , %b7
+ %r8 = add i16 %a8 , %b8
+ %r9 = add i16 %a9 , %b9
+ %r10 = add i16 %a10, %b10
+ %r11 = add i16 %a11, %b11
+ %r12 = add i16 %a12, %b12
+ %r13 = add i16 %a13, %b13
+ %r14 = add i16 %a14, %b14
+ %r15 = add i16 %a15, %b15
+ %r16 = add i16 %a16, %b16
+ %r17 = add i16 %a17, %b17
+ %r18 = add i16 %a18, %b18
+ %r19 = add i16 %a19, %b19
+ %r20 = add i16 %a20, %b20
+ %r21 = add i16 %a21, %b21
+ %r22 = add i16 %a22, %b22
+ %r23 = add i16 %a23, %b23
+ %r24 = add i16 %a24, %b24
+ %r25 = add i16 %a25, %b25
+ %r26 = add i16 %a26, %b26
+ %r27 = add i16 %a27, %b27
+ %r28 = add i16 %a28, %b28
+ %r29 = add i16 %a29, %b29
+ %r30 = add i16 %a30, %b30
+ %r31 = add i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
+define void @add_v64i8() {
+; CHECK-LABEL: @add_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = add i8 %a0 , %b0
+ %r1 = add i8 %a1 , %b1
+ %r2 = add i8 %a2 , %b2
+ %r3 = add i8 %a3 , %b3
+ %r4 = add i8 %a4 , %b4
+ %r5 = add i8 %a5 , %b5
+ %r6 = add i8 %a6 , %b6
+ %r7 = add i8 %a7 , %b7
+ %r8 = add i8 %a8 , %b8
+ %r9 = add i8 %a9 , %b9
+ %r10 = add i8 %a10, %b10
+ %r11 = add i8 %a11, %b11
+ %r12 = add i8 %a12, %b12
+ %r13 = add i8 %a13, %b13
+ %r14 = add i8 %a14, %b14
+ %r15 = add i8 %a15, %b15
+ %r16 = add i8 %a16, %b16
+ %r17 = add i8 %a17, %b17
+ %r18 = add i8 %a18, %b18
+ %r19 = add i8 %a19, %b19
+ %r20 = add i8 %a20, %b20
+ %r21 = add i8 %a21, %b21
+ %r22 = add i8 %a22, %b22
+ %r23 = add i8 %a23, %b23
+ %r24 = add i8 %a24, %b24
+ %r25 = add i8 %a25, %b25
+ %r26 = add i8 %a26, %b26
+ %r27 = add i8 %a27, %b27
+ %r28 = add i8 %a28, %b28
+ %r29 = add i8 %a29, %b29
+ %r30 = add i8 %a30, %b30
+ %r31 = add i8 %a31, %b31
+ %r32 = add i8 %a32, %b32
+ %r33 = add i8 %a33, %b33
+ %r34 = add i8 %a34, %b34
+ %r35 = add i8 %a35, %b35
+ %r36 = add i8 %a36, %b36
+ %r37 = add i8 %a37, %b37
+ %r38 = add i8 %a38, %b38
+ %r39 = add i8 %a39, %b39
+ %r40 = add i8 %a40, %b40
+ %r41 = add i8 %a41, %b41
+ %r42 = add i8 %a42, %b42
+ %r43 = add i8 %a43, %b43
+ %r44 = add i8 %a44, %b44
+ %r45 = add i8 %a45, %b45
+ %r46 = add i8 %a46, %b46
+ %r47 = add i8 %a47, %b47
+ %r48 = add i8 %a48, %b48
+ %r49 = add i8 %a49, %b49
+ %r50 = add i8 %a50, %b50
+ %r51 = add i8 %a51, %b51
+ %r52 = add i8 %a52, %b52
+ %r53 = add i8 %a53, %b53
+ %r54 = add i8 %a54, %b54
+ %r55 = add i8 %a55, %b55
+ %r56 = add i8 %a56, %b56
+ %r57 = add i8 %a57, %b57
+ %r58 = add i8 %a58, %b58
+ %r59 = add i8 %a59, %b59
+ %r60 = add i8 %a60, %b60
+ %r61 = add i8 %a61, %b61
+ %r62 = add i8 %a62, %b62
+ %r63 = add i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
new file mode 100644
index 00000000000..a8897aac631
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
@@ -0,0 +1,1765 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare i64 @llvm.smul.fix.i64(i64, i64, i32)
+declare i32 @llvm.smul.fix.i32(i32, i32, i32)
+declare i16 @llvm.smul.fix.i16(i16, i16, i32)
+declare i8 @llvm.smul.fix.i8 (i8 , i8 , i32)
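+; llvm.smul.fix(%a, %b, %scale) is signed fixed-point multiplication: both
+; operands carry %scale fractional bits, and the result is conceptually the
+; double-width product (sext(%a) * sext(%b)) arithmetically shifted right by
+; %scale. The checks expect these calls to be widened to their vector forms.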
+
+define void @smul_v8i64() {
+; SSE-LABEL: @smul_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @smul_v8i64(
+; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: ret void
+;
+; AVX1-LABEL: @smul_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @smul_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
+; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @smul_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; AVX256BW-LABEL: @smul_v8i64(
+; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
+; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
+; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = call i64 @llvm.smul.fix.i64(i64 %a0, i64 %b0, i32 3)
+ %r1 = call i64 @llvm.smul.fix.i64(i64 %a1, i64 %b1, i32 3)
+ %r2 = call i64 @llvm.smul.fix.i64(i64 %a2, i64 %b2, i32 3)
+ %r3 = call i64 @llvm.smul.fix.i64(i64 %a3, i64 %b3, i32 3)
+ %r4 = call i64 @llvm.smul.fix.i64(i64 %a4, i64 %b4, i32 3)
+ %r5 = call i64 @llvm.smul.fix.i64(i64 %a5, i64 %b5, i32 3)
+ %r6 = call i64 @llvm.smul.fix.i64(i64 %a6, i64 %b6, i32 3)
+ %r7 = call i64 @llvm.smul.fix.i64(i64 %a7, i64 %b7, i32 3)
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
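+; For the i32 case below, the SLM and AVX1 prefixes expect fully scalar
+; output - the cost model keeps llvm.smul.fix.i32 scalar on those
+; subtargets - while SSE (<4 x i32>), AVX2/AVX256BW (<8 x i32>) and
+; AVX512 (<16 x i32>) vectorize as usual.
+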
+define void @smul_v16i32() {
+; SSE-LABEL: @smul_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]], i32 3)
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]], i32 3)
+; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]], i32 3)
+; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]], i32 3)
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @smul_v16i32(
+; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SLM-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SLM-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SLM-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SLM-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SLM-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SLM-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SLM-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SLM-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SLM-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SLM-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SLM-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SLM-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SLM-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SLM-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SLM-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SLM-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SLM-NEXT: [[R0:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3)
+; SLM-NEXT: [[R1:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3)
+; SLM-NEXT: [[R2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3)
+; SLM-NEXT: [[R3:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3)
+; SLM-NEXT: [[R4:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3)
+; SLM-NEXT: [[R5:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3)
+; SLM-NEXT: [[R6:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3)
+; SLM-NEXT: [[R7:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3)
+; SLM-NEXT: [[R8:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3)
+; SLM-NEXT: [[R9:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3)
+; SLM-NEXT: [[R10:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3)
+; SLM-NEXT: [[R11:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3)
+; SLM-NEXT: [[R12:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3)
+; SLM-NEXT: [[R13:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3)
+; SLM-NEXT: [[R14:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3)
+; SLM-NEXT: [[R15:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3)
+; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SLM-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SLM-NEXT: ret void
+;
+; AVX1-LABEL: @smul_v16i32(
+; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3)
+; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3)
+; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3)
+; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3)
+; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3)
+; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3)
+; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3)
+; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3)
+; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3)
+; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3)
+; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3)
+; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3)
+; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3)
+; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3)
+; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3)
+; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.smul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3)
+; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @smul_v16i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
+; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
+; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @smul_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; AVX256BW-LABEL: @smul_v16i32(
+; AVX256BW-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX256BW-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX256BW-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX256BW-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX256BW-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
+; AVX256BW-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
+; AVX256BW-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX256BW-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX256BW-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = call i32 @llvm.smul.fix.i32(i32 %a0 , i32 %b0 , i32 3)
+ %r1 = call i32 @llvm.smul.fix.i32(i32 %a1 , i32 %b1 , i32 3)
+ %r2 = call i32 @llvm.smul.fix.i32(i32 %a2 , i32 %b2 , i32 3)
+ %r3 = call i32 @llvm.smul.fix.i32(i32 %a3 , i32 %b3 , i32 3)
+ %r4 = call i32 @llvm.smul.fix.i32(i32 %a4 , i32 %b4 , i32 3)
+ %r5 = call i32 @llvm.smul.fix.i32(i32 %a5 , i32 %b5 , i32 3)
+ %r6 = call i32 @llvm.smul.fix.i32(i32 %a6 , i32 %b6 , i32 3)
+ %r7 = call i32 @llvm.smul.fix.i32(i32 %a7 , i32 %b7 , i32 3)
+ %r8 = call i32 @llvm.smul.fix.i32(i32 %a8 , i32 %b8 , i32 3)
+ %r9 = call i32 @llvm.smul.fix.i32(i32 %a9 , i32 %b9 , i32 3)
+ %r10 = call i32 @llvm.smul.fix.i32(i32 %a10, i32 %b10, i32 3)
+ %r11 = call i32 @llvm.smul.fix.i32(i32 %a11, i32 %b11, i32 3)
+ %r12 = call i32 @llvm.smul.fix.i32(i32 %a12, i32 %b12, i32 3)
+ %r13 = call i32 @llvm.smul.fix.i32(i32 %a13, i32 %b13, i32 3)
+ %r14 = call i32 @llvm.smul.fix.i32(i32 %a14, i32 %b14, i32 3)
+ %r15 = call i32 @llvm.smul.fix.i32(i32 %a15, i32 %b15, i32 3)
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
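+; For the i16 case every prefix vectorizes (SSE/SLM in <8 x i16> blocks,
+; AVX in <16 x i16>); note the AVX512 checks expect the same pair of
+; <16 x i16> operations as AVX rather than a single <32 x i16> call.
+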
+define void @smul_v32i16() {
+; SSE-LABEL: @smul_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
+; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
+; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
+; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @smul_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
+; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
+; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
+; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @smul_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
+; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @smul_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
+; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = call i16 @llvm.smul.fix.i16(i16 %a0 , i16 %b0 , i32 3)
+ %r1 = call i16 @llvm.smul.fix.i16(i16 %a1 , i16 %b1 , i32 3)
+ %r2 = call i16 @llvm.smul.fix.i16(i16 %a2 , i16 %b2 , i32 3)
+ %r3 = call i16 @llvm.smul.fix.i16(i16 %a3 , i16 %b3 , i32 3)
+ %r4 = call i16 @llvm.smul.fix.i16(i16 %a4 , i16 %b4 , i32 3)
+ %r5 = call i16 @llvm.smul.fix.i16(i16 %a5 , i16 %b5 , i32 3)
+ %r6 = call i16 @llvm.smul.fix.i16(i16 %a6 , i16 %b6 , i32 3)
+ %r7 = call i16 @llvm.smul.fix.i16(i16 %a7 , i16 %b7 , i32 3)
+ %r8 = call i16 @llvm.smul.fix.i16(i16 %a8 , i16 %b8 , i32 3)
+ %r9 = call i16 @llvm.smul.fix.i16(i16 %a9 , i16 %b9 , i32 3)
+ %r10 = call i16 @llvm.smul.fix.i16(i16 %a10, i16 %b10, i32 3)
+ %r11 = call i16 @llvm.smul.fix.i16(i16 %a11, i16 %b11, i32 3)
+ %r12 = call i16 @llvm.smul.fix.i16(i16 %a12, i16 %b12, i32 3)
+ %r13 = call i16 @llvm.smul.fix.i16(i16 %a13, i16 %b13, i32 3)
+ %r14 = call i16 @llvm.smul.fix.i16(i16 %a14, i16 %b14, i32 3)
+ %r15 = call i16 @llvm.smul.fix.i16(i16 %a15, i16 %b15, i32 3)
+ %r16 = call i16 @llvm.smul.fix.i16(i16 %a16, i16 %b16, i32 3)
+ %r17 = call i16 @llvm.smul.fix.i16(i16 %a17, i16 %b17, i32 3)
+ %r18 = call i16 @llvm.smul.fix.i16(i16 %a18, i16 %b18, i32 3)
+ %r19 = call i16 @llvm.smul.fix.i16(i16 %a19, i16 %b19, i32 3)
+ %r20 = call i16 @llvm.smul.fix.i16(i16 %a20, i16 %b20, i32 3)
+ %r21 = call i16 @llvm.smul.fix.i16(i16 %a21, i16 %b21, i32 3)
+ %r22 = call i16 @llvm.smul.fix.i16(i16 %a22, i16 %b22, i32 3)
+ %r23 = call i16 @llvm.smul.fix.i16(i16 %a23, i16 %b23, i32 3)
+ %r24 = call i16 @llvm.smul.fix.i16(i16 %a24, i16 %b24, i32 3)
+ %r25 = call i16 @llvm.smul.fix.i16(i16 %a25, i16 %b25, i32 3)
+ %r26 = call i16 @llvm.smul.fix.i16(i16 %a26, i16 %b26, i32 3)
+ %r27 = call i16 @llvm.smul.fix.i16(i16 %a27, i16 %b27, i32 3)
+ %r28 = call i16 @llvm.smul.fix.i16(i16 %a28, i16 %b28, i32 3)
+ %r29 = call i16 @llvm.smul.fix.i16(i16 %a29, i16 %b29, i32 3)
+ %r30 = call i16 @llvm.smul.fix.i16(i16 %a30, i16 %b30, i32 3)
+ %r31 = call i16 @llvm.smul.fix.i16(i16 %a31, i16 %b31, i32 3)
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
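+; For the i8 case all subtargets agree on <16 x i8> blocks, so a single
+; CHECK prefix covers every RUN line.
+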
+define void @smul_v64i8() {
+; CHECK-LABEL: @smul_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3)
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3)
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3)
+; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3)
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = call i8 @llvm.smul.fix.i8(i8 %a0 , i8 %b0 , i32 3)
+ %r1 = call i8 @llvm.smul.fix.i8(i8 %a1 , i8 %b1 , i32 3)
+ %r2 = call i8 @llvm.smul.fix.i8(i8 %a2 , i8 %b2 , i32 3)
+ %r3 = call i8 @llvm.smul.fix.i8(i8 %a3 , i8 %b3 , i32 3)
+ %r4 = call i8 @llvm.smul.fix.i8(i8 %a4 , i8 %b4 , i32 3)
+ %r5 = call i8 @llvm.smul.fix.i8(i8 %a5 , i8 %b5 , i32 3)
+ %r6 = call i8 @llvm.smul.fix.i8(i8 %a6 , i8 %b6 , i32 3)
+ %r7 = call i8 @llvm.smul.fix.i8(i8 %a7 , i8 %b7 , i32 3)
+ %r8 = call i8 @llvm.smul.fix.i8(i8 %a8 , i8 %b8 , i32 3)
+ %r9 = call i8 @llvm.smul.fix.i8(i8 %a9 , i8 %b9 , i32 3)
+ %r10 = call i8 @llvm.smul.fix.i8(i8 %a10, i8 %b10, i32 3)
+ %r11 = call i8 @llvm.smul.fix.i8(i8 %a11, i8 %b11, i32 3)
+ %r12 = call i8 @llvm.smul.fix.i8(i8 %a12, i8 %b12, i32 3)
+ %r13 = call i8 @llvm.smul.fix.i8(i8 %a13, i8 %b13, i32 3)
+ %r14 = call i8 @llvm.smul.fix.i8(i8 %a14, i8 %b14, i32 3)
+ %r15 = call i8 @llvm.smul.fix.i8(i8 %a15, i8 %b15, i32 3)
+ %r16 = call i8 @llvm.smul.fix.i8(i8 %a16, i8 %b16, i32 3)
+ %r17 = call i8 @llvm.smul.fix.i8(i8 %a17, i8 %b17, i32 3)
+ %r18 = call i8 @llvm.smul.fix.i8(i8 %a18, i8 %b18, i32 3)
+ %r19 = call i8 @llvm.smul.fix.i8(i8 %a19, i8 %b19, i32 3)
+ %r20 = call i8 @llvm.smul.fix.i8(i8 %a20, i8 %b20, i32 3)
+ %r21 = call i8 @llvm.smul.fix.i8(i8 %a21, i8 %b21, i32 3)
+ %r22 = call i8 @llvm.smul.fix.i8(i8 %a22, i8 %b22, i32 3)
+ %r23 = call i8 @llvm.smul.fix.i8(i8 %a23, i8 %b23, i32 3)
+ %r24 = call i8 @llvm.smul.fix.i8(i8 %a24, i8 %b24, i32 3)
+ %r25 = call i8 @llvm.smul.fix.i8(i8 %a25, i8 %b25, i32 3)
+ %r26 = call i8 @llvm.smul.fix.i8(i8 %a26, i8 %b26, i32 3)
+ %r27 = call i8 @llvm.smul.fix.i8(i8 %a27, i8 %b27, i32 3)
+ %r28 = call i8 @llvm.smul.fix.i8(i8 %a28, i8 %b28, i32 3)
+ %r29 = call i8 @llvm.smul.fix.i8(i8 %a29, i8 %b29, i32 3)
+ %r30 = call i8 @llvm.smul.fix.i8(i8 %a30, i8 %b30, i32 3)
+ %r31 = call i8 @llvm.smul.fix.i8(i8 %a31, i8 %b31, i32 3)
+ %r32 = call i8 @llvm.smul.fix.i8(i8 %a32, i8 %b32, i32 3)
+ %r33 = call i8 @llvm.smul.fix.i8(i8 %a33, i8 %b33, i32 3)
+ %r34 = call i8 @llvm.smul.fix.i8(i8 %a34, i8 %b34, i32 3)
+ %r35 = call i8 @llvm.smul.fix.i8(i8 %a35, i8 %b35, i32 3)
+ %r36 = call i8 @llvm.smul.fix.i8(i8 %a36, i8 %b36, i32 3)
+ %r37 = call i8 @llvm.smul.fix.i8(i8 %a37, i8 %b37, i32 3)
+ %r38 = call i8 @llvm.smul.fix.i8(i8 %a38, i8 %b38, i32 3)
+ %r39 = call i8 @llvm.smul.fix.i8(i8 %a39, i8 %b39, i32 3)
+ %r40 = call i8 @llvm.smul.fix.i8(i8 %a40, i8 %b40, i32 3)
+ %r41 = call i8 @llvm.smul.fix.i8(i8 %a41, i8 %b41, i32 3)
+ %r42 = call i8 @llvm.smul.fix.i8(i8 %a42, i8 %b42, i32 3)
+ %r43 = call i8 @llvm.smul.fix.i8(i8 %a43, i8 %b43, i32 3)
+ %r44 = call i8 @llvm.smul.fix.i8(i8 %a44, i8 %b44, i32 3)
+ %r45 = call i8 @llvm.smul.fix.i8(i8 %a45, i8 %b45, i32 3)
+ %r46 = call i8 @llvm.smul.fix.i8(i8 %a46, i8 %b46, i32 3)
+ %r47 = call i8 @llvm.smul.fix.i8(i8 %a47, i8 %b47, i32 3)
+ %r48 = call i8 @llvm.smul.fix.i8(i8 %a48, i8 %b48, i32 3)
+ %r49 = call i8 @llvm.smul.fix.i8(i8 %a49, i8 %b49, i32 3)
+ %r50 = call i8 @llvm.smul.fix.i8(i8 %a50, i8 %b50, i32 3)
+ %r51 = call i8 @llvm.smul.fix.i8(i8 %a51, i8 %b51, i32 3)
+ %r52 = call i8 @llvm.smul.fix.i8(i8 %a52, i8 %b52, i32 3)
+ %r53 = call i8 @llvm.smul.fix.i8(i8 %a53, i8 %b53, i32 3)
+ %r54 = call i8 @llvm.smul.fix.i8(i8 %a54, i8 %b54, i32 3)
+ %r55 = call i8 @llvm.smul.fix.i8(i8 %a55, i8 %b55, i32 3)
+ %r56 = call i8 @llvm.smul.fix.i8(i8 %a56, i8 %b56, i32 3)
+ %r57 = call i8 @llvm.smul.fix.i8(i8 %a57, i8 %b57, i32 3)
+ %r58 = call i8 @llvm.smul.fix.i8(i8 %a58, i8 %b58, i32 3)
+ %r59 = call i8 @llvm.smul.fix.i8(i8 %a59, i8 %b59, i32 3)
+ %r60 = call i8 @llvm.smul.fix.i8(i8 %a60, i8 %b60, i32 3)
+ %r61 = call i8 @llvm.smul.fix.i8(i8 %a61, i8 %b61, i32 3)
+ %r62 = call i8 @llvm.smul.fix.i8(i8 %a62, i8 %b62, i32 3)
+ %r63 = call i8 @llvm.smul.fix.i8(i8 %a63, i8 %b63, i32 3)
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
+
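+; A brief orientation comment for the unsigned half of this file (an
+; editorial addition, not part of the generated checks): the umul.fix
+; intrinsics compute the full-width product of their operands right-shifted
+; by the scale argument and truncated back to the operand type. As with the
+; signed tests above, the SLP vectorizer is expected to widen these scalar
+; calls into the corresponding vector intrinsics, with the vector width
+; bounded by each target's register size.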
+declare i64 @llvm.umul.fix.i64(i64, i64, i32)
+declare i32 @llvm.umul.fix.i32(i32, i32, i32)
+declare i16 @llvm.umul.fix.i16(i16, i16, i32)
+declare i8 @llvm.umul.fix.i8 (i8 , i8 , i32)
+
+define void @umul_v8i64() {
+; SSE-LABEL: @umul_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @umul_v8i64(
+; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: ret void
+;
+; AVX1-LABEL: @umul_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3)
+; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3)
+; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3)
+; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3)
+; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @umul_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
+; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @umul_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umul.fix.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]], i32 3)
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; AVX256BW-LABEL: @umul_v8i64(
+; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3)
+; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3)
+; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = call i64 @llvm.umul.fix.i64(i64 %a0, i64 %b0, i32 3)
+ %r1 = call i64 @llvm.umul.fix.i64(i64 %a1, i64 %b1, i32 3)
+ %r2 = call i64 @llvm.umul.fix.i64(i64 %a2, i64 %b2, i32 3)
+ %r3 = call i64 @llvm.umul.fix.i64(i64 %a3, i64 %b3, i32 3)
+ %r4 = call i64 @llvm.umul.fix.i64(i64 %a4, i64 %b4, i32 3)
+ %r5 = call i64 @llvm.umul.fix.i64(i64 %a5, i64 %b5, i32 3)
+ %r6 = call i64 @llvm.umul.fix.i64(i64 %a6, i64 %b6, i32 3)
+ %r7 = call i64 @llvm.umul.fix.i64(i64 %a7, i64 %b7, i32 3)
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
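+; Note (editorial): the autogenerated checks below show the SSE, SLM and
+; AVX1 configurations leaving the i32 fixed-point multiplies scalar, while
+; AVX2, AVX512 and AVX256BW vectorize them at their respective register
+; widths.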
+define void @umul_v16i32() {
+; SSE-LABEL: @umul_v16i32(
+; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3)
+; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3)
+; SSE-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3)
+; SSE-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3)
+; SSE-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3)
+; SSE-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3)
+; SSE-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3)
+; SSE-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3)
+; SSE-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3)
+; SSE-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3)
+; SSE-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3)
+; SSE-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3)
+; SSE-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3)
+; SSE-NEXT: [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3)
+; SSE-NEXT: [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3)
+; SSE-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3)
+; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @umul_v16i32(
+; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SLM-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SLM-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SLM-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SLM-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SLM-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SLM-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SLM-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SLM-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SLM-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SLM-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SLM-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SLM-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SLM-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SLM-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SLM-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SLM-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SLM-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3)
+; SLM-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3)
+; SLM-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3)
+; SLM-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3)
+; SLM-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3)
+; SLM-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3)
+; SLM-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3)
+; SLM-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3)
+; SLM-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3)
+; SLM-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3)
+; SLM-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3)
+; SLM-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3)
+; SLM-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3)
+; SLM-NEXT: [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3)
+; SLM-NEXT: [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3)
+; SLM-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3)
+; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SLM-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SLM-NEXT: ret void
+;
+; AVX1-LABEL: @umul_v16i32(
+; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; AVX1-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; AVX1-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; AVX1-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; AVX1-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; AVX1-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; AVX1-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; AVX1-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; AVX1-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; AVX1-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; AVX1-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; AVX1-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; AVX1-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; AVX1-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; AVX1-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; AVX1-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; AVX1-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A0]], i32 [[B0]], i32 3)
+; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A1]], i32 [[B1]], i32 3)
+; AVX1-NEXT: [[R2:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A2]], i32 [[B2]], i32 3)
+; AVX1-NEXT: [[R3:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A3]], i32 [[B3]], i32 3)
+; AVX1-NEXT: [[R4:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A4]], i32 [[B4]], i32 3)
+; AVX1-NEXT: [[R5:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A5]], i32 [[B5]], i32 3)
+; AVX1-NEXT: [[R6:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A6]], i32 [[B6]], i32 3)
+; AVX1-NEXT: [[R7:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A7]], i32 [[B7]], i32 3)
+; AVX1-NEXT: [[R8:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A8]], i32 [[B8]], i32 3)
+; AVX1-NEXT: [[R9:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A9]], i32 [[B9]], i32 3)
+; AVX1-NEXT: [[R10:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A10]], i32 [[B10]], i32 3)
+; AVX1-NEXT: [[R11:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A11]], i32 [[B11]], i32 3)
+; AVX1-NEXT: [[R12:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A12]], i32 [[B12]], i32 3)
+; AVX1-NEXT: [[R13:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A13]], i32 [[B13]], i32 3)
+; AVX1-NEXT: [[R14:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A14]], i32 [[B14]], i32 3)
+; AVX1-NEXT: [[R15:%.*]] = call i32 @llvm.umul.fix.i32(i32 [[A15]], i32 [[B15]], i32 3)
+; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @umul_v16i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
+; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
+; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @umul_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umul.fix.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]], i32 3)
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; AVX256BW-LABEL: @umul_v16i32(
+; AVX256BW-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX256BW-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX256BW-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX256BW-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX256BW-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3)
+; AVX256BW-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3)
+; AVX256BW-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX256BW-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX256BW-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = call i32 @llvm.umul.fix.i32(i32 %a0 , i32 %b0 , i32 3)
+ %r1 = call i32 @llvm.umul.fix.i32(i32 %a1 , i32 %b1 , i32 3)
+ %r2 = call i32 @llvm.umul.fix.i32(i32 %a2 , i32 %b2 , i32 3)
+ %r3 = call i32 @llvm.umul.fix.i32(i32 %a3 , i32 %b3 , i32 3)
+ %r4 = call i32 @llvm.umul.fix.i32(i32 %a4 , i32 %b4 , i32 3)
+ %r5 = call i32 @llvm.umul.fix.i32(i32 %a5 , i32 %b5 , i32 3)
+ %r6 = call i32 @llvm.umul.fix.i32(i32 %a6 , i32 %b6 , i32 3)
+ %r7 = call i32 @llvm.umul.fix.i32(i32 %a7 , i32 %b7 , i32 3)
+ %r8 = call i32 @llvm.umul.fix.i32(i32 %a8 , i32 %b8 , i32 3)
+ %r9 = call i32 @llvm.umul.fix.i32(i32 %a9 , i32 %b9 , i32 3)
+ %r10 = call i32 @llvm.umul.fix.i32(i32 %a10, i32 %b10, i32 3)
+ %r11 = call i32 @llvm.umul.fix.i32(i32 %a11, i32 %b11, i32 3)
+ %r12 = call i32 @llvm.umul.fix.i32(i32 %a12, i32 %b12, i32 3)
+ %r13 = call i32 @llvm.umul.fix.i32(i32 %a13, i32 %b13, i32 3)
+ %r14 = call i32 @llvm.umul.fix.i32(i32 %a14, i32 %b14, i32 3)
+ %r15 = call i32 @llvm.umul.fix.i32(i32 %a15, i32 %b15, i32 3)
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
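+; Same fixed-point pattern at i16 granularity: 32 scalar calls to
+; @llvm.umul.fix.i16 that should collapse into <8 x i16> ops on the 128-bit
+; targets and into <16 x i16> ops where 256-bit vectors are available.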
+define void @umul_v32i16() {
+; SSE-LABEL: @umul_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
+; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
+; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
+; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @umul_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3)
+; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3)
+; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3)
+; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3)
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @umul_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
+; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @umul_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3)
+; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3)
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = call i16 @llvm.umul.fix.i16(i16 %a0 , i16 %b0 , i32 3)
+ %r1 = call i16 @llvm.umul.fix.i16(i16 %a1 , i16 %b1 , i32 3)
+ %r2 = call i16 @llvm.umul.fix.i16(i16 %a2 , i16 %b2 , i32 3)
+ %r3 = call i16 @llvm.umul.fix.i16(i16 %a3 , i16 %b3 , i32 3)
+ %r4 = call i16 @llvm.umul.fix.i16(i16 %a4 , i16 %b4 , i32 3)
+ %r5 = call i16 @llvm.umul.fix.i16(i16 %a5 , i16 %b5 , i32 3)
+ %r6 = call i16 @llvm.umul.fix.i16(i16 %a6 , i16 %b6 , i32 3)
+ %r7 = call i16 @llvm.umul.fix.i16(i16 %a7 , i16 %b7 , i32 3)
+ %r8 = call i16 @llvm.umul.fix.i16(i16 %a8 , i16 %b8 , i32 3)
+ %r9 = call i16 @llvm.umul.fix.i16(i16 %a9 , i16 %b9 , i32 3)
+ %r10 = call i16 @llvm.umul.fix.i16(i16 %a10, i16 %b10, i32 3)
+ %r11 = call i16 @llvm.umul.fix.i16(i16 %a11, i16 %b11, i32 3)
+ %r12 = call i16 @llvm.umul.fix.i16(i16 %a12, i16 %b12, i32 3)
+ %r13 = call i16 @llvm.umul.fix.i16(i16 %a13, i16 %b13, i32 3)
+ %r14 = call i16 @llvm.umul.fix.i16(i16 %a14, i16 %b14, i32 3)
+ %r15 = call i16 @llvm.umul.fix.i16(i16 %a15, i16 %b15, i32 3)
+ %r16 = call i16 @llvm.umul.fix.i16(i16 %a16, i16 %b16, i32 3)
+ %r17 = call i16 @llvm.umul.fix.i16(i16 %a17, i16 %b17, i32 3)
+ %r18 = call i16 @llvm.umul.fix.i16(i16 %a18, i16 %b18, i32 3)
+ %r19 = call i16 @llvm.umul.fix.i16(i16 %a19, i16 %b19, i32 3)
+ %r20 = call i16 @llvm.umul.fix.i16(i16 %a20, i16 %b20, i32 3)
+ %r21 = call i16 @llvm.umul.fix.i16(i16 %a21, i16 %b21, i32 3)
+ %r22 = call i16 @llvm.umul.fix.i16(i16 %a22, i16 %b22, i32 3)
+ %r23 = call i16 @llvm.umul.fix.i16(i16 %a23, i16 %b23, i32 3)
+ %r24 = call i16 @llvm.umul.fix.i16(i16 %a24, i16 %b24, i32 3)
+ %r25 = call i16 @llvm.umul.fix.i16(i16 %a25, i16 %b25, i32 3)
+ %r26 = call i16 @llvm.umul.fix.i16(i16 %a26, i16 %b26, i32 3)
+ %r27 = call i16 @llvm.umul.fix.i16(i16 %a27, i16 %b27, i32 3)
+ %r28 = call i16 @llvm.umul.fix.i16(i16 %a28, i16 %b28, i32 3)
+ %r29 = call i16 @llvm.umul.fix.i16(i16 %a29, i16 %b29, i32 3)
+ %r30 = call i16 @llvm.umul.fix.i16(i16 %a30, i16 %b30, i32 3)
+ %r31 = call i16 @llvm.umul.fix.i16(i16 %a31, i16 %b31, i32 3)
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
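+; For i8 every configuration is expected to produce the same four <16 x i8>
+; @llvm.umul.fix.v16i8 calls, so a single CHECK prefix covers all RUN lines.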
+define void @umul_v64i8() {
+; CHECK-LABEL: @umul_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3)
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3)
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3)
+; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3)
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = call i8 @llvm.umul.fix.i8(i8 %a0 , i8 %b0 , i32 3)
+ %r1 = call i8 @llvm.umul.fix.i8(i8 %a1 , i8 %b1 , i32 3)
+ %r2 = call i8 @llvm.umul.fix.i8(i8 %a2 , i8 %b2 , i32 3)
+ %r3 = call i8 @llvm.umul.fix.i8(i8 %a3 , i8 %b3 , i32 3)
+ %r4 = call i8 @llvm.umul.fix.i8(i8 %a4 , i8 %b4 , i32 3)
+ %r5 = call i8 @llvm.umul.fix.i8(i8 %a5 , i8 %b5 , i32 3)
+ %r6 = call i8 @llvm.umul.fix.i8(i8 %a6 , i8 %b6 , i32 3)
+ %r7 = call i8 @llvm.umul.fix.i8(i8 %a7 , i8 %b7 , i32 3)
+ %r8 = call i8 @llvm.umul.fix.i8(i8 %a8 , i8 %b8 , i32 3)
+ %r9 = call i8 @llvm.umul.fix.i8(i8 %a9 , i8 %b9 , i32 3)
+ %r10 = call i8 @llvm.umul.fix.i8(i8 %a10, i8 %b10, i32 3)
+ %r11 = call i8 @llvm.umul.fix.i8(i8 %a11, i8 %b11, i32 3)
+ %r12 = call i8 @llvm.umul.fix.i8(i8 %a12, i8 %b12, i32 3)
+ %r13 = call i8 @llvm.umul.fix.i8(i8 %a13, i8 %b13, i32 3)
+ %r14 = call i8 @llvm.umul.fix.i8(i8 %a14, i8 %b14, i32 3)
+ %r15 = call i8 @llvm.umul.fix.i8(i8 %a15, i8 %b15, i32 3)
+ %r16 = call i8 @llvm.umul.fix.i8(i8 %a16, i8 %b16, i32 3)
+ %r17 = call i8 @llvm.umul.fix.i8(i8 %a17, i8 %b17, i32 3)
+ %r18 = call i8 @llvm.umul.fix.i8(i8 %a18, i8 %b18, i32 3)
+ %r19 = call i8 @llvm.umul.fix.i8(i8 %a19, i8 %b19, i32 3)
+ %r20 = call i8 @llvm.umul.fix.i8(i8 %a20, i8 %b20, i32 3)
+ %r21 = call i8 @llvm.umul.fix.i8(i8 %a21, i8 %b21, i32 3)
+ %r22 = call i8 @llvm.umul.fix.i8(i8 %a22, i8 %b22, i32 3)
+ %r23 = call i8 @llvm.umul.fix.i8(i8 %a23, i8 %b23, i32 3)
+ %r24 = call i8 @llvm.umul.fix.i8(i8 %a24, i8 %b24, i32 3)
+ %r25 = call i8 @llvm.umul.fix.i8(i8 %a25, i8 %b25, i32 3)
+ %r26 = call i8 @llvm.umul.fix.i8(i8 %a26, i8 %b26, i32 3)
+ %r27 = call i8 @llvm.umul.fix.i8(i8 %a27, i8 %b27, i32 3)
+ %r28 = call i8 @llvm.umul.fix.i8(i8 %a28, i8 %b28, i32 3)
+ %r29 = call i8 @llvm.umul.fix.i8(i8 %a29, i8 %b29, i32 3)
+ %r30 = call i8 @llvm.umul.fix.i8(i8 %a30, i8 %b30, i32 3)
+ %r31 = call i8 @llvm.umul.fix.i8(i8 %a31, i8 %b31, i32 3)
+ %r32 = call i8 @llvm.umul.fix.i8(i8 %a32, i8 %b32, i32 3)
+ %r33 = call i8 @llvm.umul.fix.i8(i8 %a33, i8 %b33, i32 3)
+ %r34 = call i8 @llvm.umul.fix.i8(i8 %a34, i8 %b34, i32 3)
+ %r35 = call i8 @llvm.umul.fix.i8(i8 %a35, i8 %b35, i32 3)
+ %r36 = call i8 @llvm.umul.fix.i8(i8 %a36, i8 %b36, i32 3)
+ %r37 = call i8 @llvm.umul.fix.i8(i8 %a37, i8 %b37, i32 3)
+ %r38 = call i8 @llvm.umul.fix.i8(i8 %a38, i8 %b38, i32 3)
+ %r39 = call i8 @llvm.umul.fix.i8(i8 %a39, i8 %b39, i32 3)
+ %r40 = call i8 @llvm.umul.fix.i8(i8 %a40, i8 %b40, i32 3)
+ %r41 = call i8 @llvm.umul.fix.i8(i8 %a41, i8 %b41, i32 3)
+ %r42 = call i8 @llvm.umul.fix.i8(i8 %a42, i8 %b42, i32 3)
+ %r43 = call i8 @llvm.umul.fix.i8(i8 %a43, i8 %b43, i32 3)
+ %r44 = call i8 @llvm.umul.fix.i8(i8 %a44, i8 %b44, i32 3)
+ %r45 = call i8 @llvm.umul.fix.i8(i8 %a45, i8 %b45, i32 3)
+ %r46 = call i8 @llvm.umul.fix.i8(i8 %a46, i8 %b46, i32 3)
+ %r47 = call i8 @llvm.umul.fix.i8(i8 %a47, i8 %b47, i32 3)
+ %r48 = call i8 @llvm.umul.fix.i8(i8 %a48, i8 %b48, i32 3)
+ %r49 = call i8 @llvm.umul.fix.i8(i8 %a49, i8 %b49, i32 3)
+ %r50 = call i8 @llvm.umul.fix.i8(i8 %a50, i8 %b50, i32 3)
+ %r51 = call i8 @llvm.umul.fix.i8(i8 %a51, i8 %b51, i32 3)
+ %r52 = call i8 @llvm.umul.fix.i8(i8 %a52, i8 %b52, i32 3)
+ %r53 = call i8 @llvm.umul.fix.i8(i8 %a53, i8 %b53, i32 3)
+ %r54 = call i8 @llvm.umul.fix.i8(i8 %a54, i8 %b54, i32 3)
+ %r55 = call i8 @llvm.umul.fix.i8(i8 %a55, i8 %b55, i32 3)
+ %r56 = call i8 @llvm.umul.fix.i8(i8 %a56, i8 %b56, i32 3)
+ %r57 = call i8 @llvm.umul.fix.i8(i8 %a57, i8 %b57, i32 3)
+ %r58 = call i8 @llvm.umul.fix.i8(i8 %a58, i8 %b58, i32 3)
+ %r59 = call i8 @llvm.umul.fix.i8(i8 %a59, i8 %b59, i32 3)
+ %r60 = call i8 @llvm.umul.fix.i8(i8 %a60, i8 %b60, i32 3)
+ %r61 = call i8 @llvm.umul.fix.i8(i8 %a61, i8 %b61, i32 3)
+ %r62 = call i8 @llvm.umul.fix.i8(i8 %a62, i8 %b62, i32 3)
+ %r63 = call i8 @llvm.umul.fix.i8(i8 %a63, i8 %b63, i32 3)
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
new file mode 100644
index 00000000000..119cf594c90
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
@@ -0,0 +1,1323 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
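+; The common CHECK prefix carries assertions shared by every configuration;
+; SSE, SLM and AVX capture the cases where the cost model picks different
+; code, and all three AVX-class CPUs currently share the AVX prefix.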
+
+;
+; 128-bit Vectors
+;
+
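+; Each buildvector test extracts the lanes, applies the scalar op per lane,
+; and reinserts the results; the vectorizer is expected to rewrite that into
+; a single wide fadd/fsub/fmul/fdiv feeding the insertelement chain.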
+define <2 x double> @buildvector_add_2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @buildvector_add_2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: ret <2 x double> [[R1]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %c0 = fadd double %a0, %b0
+ %c1 = fadd double %a1, %b1
+ %r0 = insertelement <2 x double> undef, double %c0, i32 0
+ %r1 = insertelement <2 x double> %r0, double %c1, i32 1
+ ret <2 x double> %r1
+}
+
+define <2 x double> @buildvector_sub_2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @buildvector_sub_2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: ret <2 x double> [[R1]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %c0 = fsub double %a0, %b0
+ %c1 = fsub double %a1, %b1
+ %r0 = insertelement <2 x double> undef, double %c0, i32 0
+ %r1 = insertelement <2 x double> %r0, double %c1, i32 1
+ ret <2 x double> %r1
+}
+
+define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @buildvector_mul_2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: ret <2 x double> [[R1]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %c0 = fmul double %a0, %b0
+ %c1 = fmul double %a1, %b1
+ %r0 = insertelement <2 x double> undef, double %c0, i32 0
+ %r1 = insertelement <2 x double> %r0, double %c1, i32 1
+ ret <2 x double> %r1
+}
+
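+; fdiv is the interesting case: SSE and AVX vectorize it like the ops above,
+; but the SLM run keeps the scalar fdivs, presumably because Silvermont's
+; expensive divider makes the <2 x double> form lose in the SLP cost model.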
+define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: @buildvector_div_2f64(
+; SSE-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; SSE-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; SSE-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
+; SSE-NEXT: ret <2 x double> [[R1]]
+;
+; SLM-LABEL: @buildvector_div_2f64(
+; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
+; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]]
+; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]]
+; SLM-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1
+; SLM-NEXT: ret <2 x double> [[R1]]
+;
+; AVX-LABEL: @buildvector_div_2f64(
+; AVX-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; AVX-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; AVX-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
+; AVX-NEXT: ret <2 x double> [[R1]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %c0 = fdiv double %a0, %b0
+ %c1 = fdiv double %a1, %b1
+ %r0 = insertelement <2 x double> undef, double %c0, i32 0
+ %r1 = insertelement <2 x double> %r0, double %c1, i32 1
+ ret <2 x double> %r1
+}
+
+define <4 x float> @buildvector_add_4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @buildvector_add_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x float> [[R3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %c0 = fadd float %a0, %b0
+ %c1 = fadd float %a1, %b1
+ %c2 = fadd float %a2, %b2
+ %c3 = fadd float %a3, %b3
+ %r0 = insertelement <4 x float> undef, float %c0, i32 0
+ %r1 = insertelement <4 x float> %r0, float %c1, i32 1
+ %r2 = insertelement <4 x float> %r1, float %c2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %c3, i32 3
+ ret <4 x float> %r3
+}
+
+define <4 x float> @buildvector_sub_4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @buildvector_sub_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x float> [[R3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %c0 = fsub float %a0, %b0
+ %c1 = fsub float %a1, %b1
+ %c2 = fsub float %a2, %b2
+ %c3 = fsub float %a3, %b3
+ %r0 = insertelement <4 x float> undef, float %c0, i32 0
+ %r1 = insertelement <4 x float> %r0, float %c1, i32 1
+ %r2 = insertelement <4 x float> %r1, float %c2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %c3, i32 3
+ ret <4 x float> %r3
+}
+
+define <4 x float> @buildvector_mul_4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @buildvector_mul_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x float> [[R3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %c0 = fmul float %a0, %b0
+ %c1 = fmul float %a1, %b1
+ %c2 = fmul float %a2, %b2
+ %c3 = fmul float %a3, %b3
+ %r0 = insertelement <4 x float> undef, float %c0, i32 0
+ %r1 = insertelement <4 x float> %r0, float %c1, i32 1
+ %r2 = insertelement <4 x float> %r1, float %c2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %c3, i32 3
+ ret <4 x float> %r3
+}
+
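+; Single-precision fdiv, by contrast, is vectorized under every cost model
+; checked here, SLM included, so a common CHECK prefix suffices.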
+define <4 x float> @buildvector_div_4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @buildvector_div_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fdiv <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x float> [[R3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %c0 = fdiv float %a0, %b0
+ %c1 = fdiv float %a1, %b1
+ %c2 = fdiv float %a2, %b2
+ %c3 = fdiv float %a3, %b3
+ %r0 = insertelement <4 x float> undef, float %c0, i32 0
+ %r1 = insertelement <4 x float> %r0, float %c1, i32 1
+ %r2 = insertelement <4 x float> %r1, float %c2, i32 2
+ %r3 = insertelement <4 x float> %r2, float %c3, i32 3
+ ret <4 x float> %r3
+}
+
+;
+; 256-bit Vectors
+;
+
+define <4 x double> @buildvector_add_4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @buildvector_add_4f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x double> [[R3]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %c0 = fadd double %a0, %b0
+ %c1 = fadd double %a1, %b1
+ %c2 = fadd double %a2, %b2
+ %c3 = fadd double %a3, %b3
+ %r0 = insertelement <4 x double> undef, double %c0, i32 0
+ %r1 = insertelement <4 x double> %r0, double %c1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %c2, i32 2
+ %r3 = insertelement <4 x double> %r2, double %c3, i32 3
+ ret <4 x double> %r3
+}
+
+define <4 x double> @buildvector_sub_4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @buildvector_sub_4f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x double> [[R3]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %c0 = fsub double %a0, %b0
+ %c1 = fsub double %a1, %b1
+ %c2 = fsub double %a2, %b2
+ %c3 = fsub double %a3, %b3
+ %r0 = insertelement <4 x double> undef, double %c0, i32 0
+ %r1 = insertelement <4 x double> %r0, double %c1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %c2, i32 2
+ %r3 = insertelement <4 x double> %r2, double %c3, i32 3
+ ret <4 x double> %r3
+}
+
+define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @buildvector_mul_4f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x double> [[R3]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %c0 = fmul double %a0, %b0
+ %c1 = fmul double %a1, %b1
+ %c2 = fmul double %a2, %b2
+ %c3 = fmul double %a3, %b3
+ %r0 = insertelement <4 x double> undef, double %c0, i32 0
+ %r1 = insertelement <4 x double> %r0, double %c1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %c2, i32 2
+ %r3 = insertelement <4 x double> %r2, double %c3, i32 3
+ ret <4 x double> %r3
+}
+
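+; As in the 128-bit case, SLM keeps the double-precision divisions scalar
+; while SSE and AVX form a single <4 x double> fdiv.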
+define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @buildvector_div_4f64(
+; SSE-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; SSE-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; SSE-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; SSE-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; SSE-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
+; SSE-NEXT: ret <4 x double> [[R3]]
+;
+; SLM-LABEL: @buildvector_div_4f64(
+; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
+; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
+; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
+; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
+; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]]
+; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]]
+; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]]
+; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]]
+; SLM-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[C0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[C1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[C2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[C3]], i32 3
+; SLM-NEXT: ret <4 x double> [[R3]]
+;
+; AVX-LABEL: @buildvector_div_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
+; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
+; AVX-NEXT: ret <4 x double> [[R3]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %c0 = fdiv double %a0, %b0
+ %c1 = fdiv double %a1, %b1
+ %c2 = fdiv double %a2, %b2
+ %c3 = fdiv double %a3, %b3
+ %r0 = insertelement <4 x double> undef, double %c0, i32 0
+ %r1 = insertelement <4 x double> %r0, double %c1, i32 1
+ %r2 = insertelement <4 x double> %r1, double %c2, i32 2
+ %r3 = insertelement <4 x double> %r2, double %c3, i32 3
+ ret <4 x double> %r3
+}
+
+define <8 x float> @buildvector_add_8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @buildvector_add_8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %c0 = fadd float %a0, %b0
+ %c1 = fadd float %a1, %b1
+ %c2 = fadd float %a2, %b2
+ %c3 = fadd float %a3, %b3
+ %c4 = fadd float %a4, %b4
+ %c5 = fadd float %a5, %b5
+ %c6 = fadd float %a6, %b6
+ %c7 = fadd float %a7, %b7
+ %r0 = insertelement <8 x float> undef, float %c0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %c1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %c2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %c3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %c4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %c5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %c6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %c7, i32 7
+ ret <8 x float> %r7
+}
+
+define <8 x float> @buildvector_sub_8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @buildvector_sub_8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %c0 = fsub float %a0, %b0
+ %c1 = fsub float %a1, %b1
+ %c2 = fsub float %a2, %b2
+ %c3 = fsub float %a3, %b3
+ %c4 = fsub float %a4, %b4
+ %c5 = fsub float %a5, %b5
+ %c6 = fsub float %a6, %b6
+ %c7 = fsub float %a7, %b7
+ %r0 = insertelement <8 x float> undef, float %c0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %c1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %c2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %c3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %c4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %c5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %c6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %c7, i32 7
+ ret <8 x float> %r7
+}
+
+define <8 x float> @buildvector_mul_8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @buildvector_mul_8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %c0 = fmul float %a0, %b0
+ %c1 = fmul float %a1, %b1
+ %c2 = fmul float %a2, %b2
+ %c3 = fmul float %a3, %b3
+ %c4 = fmul float %a4, %b4
+ %c5 = fmul float %a5, %b5
+ %c6 = fmul float %a6, %b6
+ %c7 = fmul float %a7, %b7
+ %r0 = insertelement <8 x float> undef, float %c0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %c1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %c2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %c3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %c4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %c5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %c6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %c7, i32 7
+ ret <8 x float> %r7
+}
+
+define <8 x float> @buildvector_div_8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: @buildvector_div_8f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT: ret <8 x float> [[R7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %c0 = fdiv float %a0, %b0
+ %c1 = fdiv float %a1, %b1
+ %c2 = fdiv float %a2, %b2
+ %c3 = fdiv float %a3, %b3
+ %c4 = fdiv float %a4, %b4
+ %c5 = fdiv float %a5, %b5
+ %c6 = fdiv float %a6, %b6
+ %c7 = fdiv float %a7, %b7
+ %r0 = insertelement <8 x float> undef, float %c0, i32 0
+ %r1 = insertelement <8 x float> %r0, float %c1, i32 1
+ %r2 = insertelement <8 x float> %r1, float %c2, i32 2
+ %r3 = insertelement <8 x float> %r2, float %c3, i32 3
+ %r4 = insertelement <8 x float> %r3, float %c4, i32 4
+ %r5 = insertelement <8 x float> %r4, float %c5, i32 5
+ %r6 = insertelement <8 x float> %r5, float %c6, i32 6
+ %r7 = insertelement <8 x float> %r6, float %c7, i32 7
+ ret <8 x float> %r7
+}
+
+;
+; 512-bit Vectors
+;
+
+define <8 x double> @buildvector_add_8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @buildvector_add_8f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
+; CHECK-NEXT: ret <8 x double> [[R7]]
+;
+ %a0 = extractelement <8 x double> %a, i32 0
+ %a1 = extractelement <8 x double> %a, i32 1
+ %a2 = extractelement <8 x double> %a, i32 2
+ %a3 = extractelement <8 x double> %a, i32 3
+ %a4 = extractelement <8 x double> %a, i32 4
+ %a5 = extractelement <8 x double> %a, i32 5
+ %a6 = extractelement <8 x double> %a, i32 6
+ %a7 = extractelement <8 x double> %a, i32 7
+ %b0 = extractelement <8 x double> %b, i32 0
+ %b1 = extractelement <8 x double> %b, i32 1
+ %b2 = extractelement <8 x double> %b, i32 2
+ %b3 = extractelement <8 x double> %b, i32 3
+ %b4 = extractelement <8 x double> %b, i32 4
+ %b5 = extractelement <8 x double> %b, i32 5
+ %b6 = extractelement <8 x double> %b, i32 6
+ %b7 = extractelement <8 x double> %b, i32 7
+ %c0 = fadd double %a0, %b0
+ %c1 = fadd double %a1, %b1
+ %c2 = fadd double %a2, %b2
+ %c3 = fadd double %a3, %b3
+ %c4 = fadd double %a4, %b4
+ %c5 = fadd double %a5, %b5
+ %c6 = fadd double %a6, %b6
+ %c7 = fadd double %a7, %b7
+ %r0 = insertelement <8 x double> undef, double %c0, i32 0
+ %r1 = insertelement <8 x double> %r0, double %c1, i32 1
+ %r2 = insertelement <8 x double> %r1, double %c2, i32 2
+ %r3 = insertelement <8 x double> %r2, double %c3, i32 3
+ %r4 = insertelement <8 x double> %r3, double %c4, i32 4
+ %r5 = insertelement <8 x double> %r4, double %c5, i32 5
+ %r6 = insertelement <8 x double> %r5, double %c6, i32 6
+ %r7 = insertelement <8 x double> %r6, double %c7, i32 7
+ ret <8 x double> %r7
+}
+
+define <8 x double> @buildvector_sub_8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @buildvector_sub_8f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
+; CHECK-NEXT: ret <8 x double> [[R7]]
+;
+ %a0 = extractelement <8 x double> %a, i32 0
+ %a1 = extractelement <8 x double> %a, i32 1
+ %a2 = extractelement <8 x double> %a, i32 2
+ %a3 = extractelement <8 x double> %a, i32 3
+ %a4 = extractelement <8 x double> %a, i32 4
+ %a5 = extractelement <8 x double> %a, i32 5
+ %a6 = extractelement <8 x double> %a, i32 6
+ %a7 = extractelement <8 x double> %a, i32 7
+ %b0 = extractelement <8 x double> %b, i32 0
+ %b1 = extractelement <8 x double> %b, i32 1
+ %b2 = extractelement <8 x double> %b, i32 2
+ %b3 = extractelement <8 x double> %b, i32 3
+ %b4 = extractelement <8 x double> %b, i32 4
+ %b5 = extractelement <8 x double> %b, i32 5
+ %b6 = extractelement <8 x double> %b, i32 6
+ %b7 = extractelement <8 x double> %b, i32 7
+ %c0 = fsub double %a0, %b0
+ %c1 = fsub double %a1, %b1
+ %c2 = fsub double %a2, %b2
+ %c3 = fsub double %a3, %b3
+ %c4 = fsub double %a4, %b4
+ %c5 = fsub double %a5, %b5
+ %c6 = fsub double %a6, %b6
+ %c7 = fsub double %a7, %b7
+ %r0 = insertelement <8 x double> undef, double %c0, i32 0
+ %r1 = insertelement <8 x double> %r0, double %c1, i32 1
+ %r2 = insertelement <8 x double> %r1, double %c2, i32 2
+ %r3 = insertelement <8 x double> %r2, double %c3, i32 3
+ %r4 = insertelement <8 x double> %r3, double %c4, i32 4
+ %r5 = insertelement <8 x double> %r4, double %c5, i32 5
+ %r6 = insertelement <8 x double> %r5, double %c6, i32 6
+ %r7 = insertelement <8 x double> %r6, double %c7, i32 7
+ ret <8 x double> %r7
+}
+
+define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: @buildvector_mul_8f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <8 x double> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
+; CHECK-NEXT: ret <8 x double> [[R7]]
+;
+ %a0 = extractelement <8 x double> %a, i32 0
+ %a1 = extractelement <8 x double> %a, i32 1
+ %a2 = extractelement <8 x double> %a, i32 2
+ %a3 = extractelement <8 x double> %a, i32 3
+ %a4 = extractelement <8 x double> %a, i32 4
+ %a5 = extractelement <8 x double> %a, i32 5
+ %a6 = extractelement <8 x double> %a, i32 6
+ %a7 = extractelement <8 x double> %a, i32 7
+ %b0 = extractelement <8 x double> %b, i32 0
+ %b1 = extractelement <8 x double> %b, i32 1
+ %b2 = extractelement <8 x double> %b, i32 2
+ %b3 = extractelement <8 x double> %b, i32 3
+ %b4 = extractelement <8 x double> %b, i32 4
+ %b5 = extractelement <8 x double> %b, i32 5
+ %b6 = extractelement <8 x double> %b, i32 6
+ %b7 = extractelement <8 x double> %b, i32 7
+ %c0 = fmul double %a0, %b0
+ %c1 = fmul double %a1, %b1
+ %c2 = fmul double %a2, %b2
+ %c3 = fmul double %a3, %b3
+ %c4 = fmul double %a4, %b4
+ %c5 = fmul double %a5, %b5
+ %c6 = fmul double %a6, %b6
+ %c7 = fmul double %a7, %b7
+ %r0 = insertelement <8 x double> undef, double %c0, i32 0
+ %r1 = insertelement <8 x double> %r0, double %c1, i32 1
+ %r2 = insertelement <8 x double> %r1, double %c2, i32 2
+ %r3 = insertelement <8 x double> %r2, double %c3, i32 3
+ %r4 = insertelement <8 x double> %r3, double %c4, i32 4
+ %r5 = insertelement <8 x double> %r4, double %c5, i32 5
+ %r6 = insertelement <8 x double> %r5, double %c6, i32 6
+ %r7 = insertelement <8 x double> %r6, double %c7, i32 7
+ ret <8 x double> %r7
+}
+
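+; Same pattern at 512 bits: only the SLM cost model leaves the eight
+; double-precision divisions scalar.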
+define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
+; SSE-LABEL: @buildvector_div_8f64(
+; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; SSE-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; SSE-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; SSE-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; SSE-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; SSE-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; SSE-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; SSE-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
+; SSE-NEXT: ret <8 x double> [[R7]]
+;
+; SLM-LABEL: @buildvector_div_8f64(
+; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
+; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
+; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; SLM-NEXT: [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1
+; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2
+; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3
+; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4
+; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
+; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
+; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
+; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]]
+; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]]
+; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]]
+; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]]
+; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]]
+; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]]
+; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]]
+; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]]
+; SLM-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
+; SLM-NEXT: ret <8 x double> [[R7]]
+;
+; AVX-LABEL: @buildvector_div_8f64(
+; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
+; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
+; AVX-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
+; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
+; AVX-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
+; AVX-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
+; AVX-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
+; AVX-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
+; AVX-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
+; AVX-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
+; AVX-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
+; AVX-NEXT: ret <8 x double> [[R7]]
+;
+ %a0 = extractelement <8 x double> %a, i32 0
+ %a1 = extractelement <8 x double> %a, i32 1
+ %a2 = extractelement <8 x double> %a, i32 2
+ %a3 = extractelement <8 x double> %a, i32 3
+ %a4 = extractelement <8 x double> %a, i32 4
+ %a5 = extractelement <8 x double> %a, i32 5
+ %a6 = extractelement <8 x double> %a, i32 6
+ %a7 = extractelement <8 x double> %a, i32 7
+ %b0 = extractelement <8 x double> %b, i32 0
+ %b1 = extractelement <8 x double> %b, i32 1
+ %b2 = extractelement <8 x double> %b, i32 2
+ %b3 = extractelement <8 x double> %b, i32 3
+ %b4 = extractelement <8 x double> %b, i32 4
+ %b5 = extractelement <8 x double> %b, i32 5
+ %b6 = extractelement <8 x double> %b, i32 6
+ %b7 = extractelement <8 x double> %b, i32 7
+ %c0 = fdiv double %a0, %b0
+ %c1 = fdiv double %a1, %b1
+ %c2 = fdiv double %a2, %b2
+ %c3 = fdiv double %a3, %b3
+ %c4 = fdiv double %a4, %b4
+ %c5 = fdiv double %a5, %b5
+ %c6 = fdiv double %a6, %b6
+ %c7 = fdiv double %a7, %b7
+ %r0 = insertelement <8 x double> undef, double %c0, i32 0
+ %r1 = insertelement <8 x double> %r0, double %c1, i32 1
+ %r2 = insertelement <8 x double> %r1, double %c2, i32 2
+ %r3 = insertelement <8 x double> %r2, double %c3, i32 3
+ %r4 = insertelement <8 x double> %r3, double %c4, i32 4
+ %r5 = insertelement <8 x double> %r4, double %c5, i32 5
+ %r6 = insertelement <8 x double> %r5, double %c6, i32 6
+ %r7 = insertelement <8 x double> %r6, double %c7, i32 7
+ ret <8 x double> %r7
+}
+
+define <16 x float> @buildvector_add_16f32(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @buildvector_add_16f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
+; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
+; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
+; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
+; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
+; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
+; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
+; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
+; CHECK-NEXT: ret <16 x float> [[R15]]
+;
+ %a0 = extractelement <16 x float> %a, i32 0
+ %a1 = extractelement <16 x float> %a, i32 1
+ %a2 = extractelement <16 x float> %a, i32 2
+ %a3 = extractelement <16 x float> %a, i32 3
+ %a4 = extractelement <16 x float> %a, i32 4
+ %a5 = extractelement <16 x float> %a, i32 5
+ %a6 = extractelement <16 x float> %a, i32 6
+ %a7 = extractelement <16 x float> %a, i32 7
+ %a8 = extractelement <16 x float> %a, i32 8
+ %a9 = extractelement <16 x float> %a, i32 9
+ %a10 = extractelement <16 x float> %a, i32 10
+ %a11 = extractelement <16 x float> %a, i32 11
+ %a12 = extractelement <16 x float> %a, i32 12
+ %a13 = extractelement <16 x float> %a, i32 13
+ %a14 = extractelement <16 x float> %a, i32 14
+ %a15 = extractelement <16 x float> %a, i32 15
+ %b0 = extractelement <16 x float> %b, i32 0
+ %b1 = extractelement <16 x float> %b, i32 1
+ %b2 = extractelement <16 x float> %b, i32 2
+ %b3 = extractelement <16 x float> %b, i32 3
+ %b4 = extractelement <16 x float> %b, i32 4
+ %b5 = extractelement <16 x float> %b, i32 5
+ %b6 = extractelement <16 x float> %b, i32 6
+ %b7 = extractelement <16 x float> %b, i32 7
+ %b8 = extractelement <16 x float> %b, i32 8
+ %b9 = extractelement <16 x float> %b, i32 9
+ %b10 = extractelement <16 x float> %b, i32 10
+ %b11 = extractelement <16 x float> %b, i32 11
+ %b12 = extractelement <16 x float> %b, i32 12
+ %b13 = extractelement <16 x float> %b, i32 13
+ %b14 = extractelement <16 x float> %b, i32 14
+ %b15 = extractelement <16 x float> %b, i32 15
+ %c0 = fadd float %a0 , %b0
+ %c1 = fadd float %a1 , %b1
+ %c2 = fadd float %a2 , %b2
+ %c3 = fadd float %a3 , %b3
+ %c4 = fadd float %a4 , %b4
+ %c5 = fadd float %a5 , %b5
+ %c6 = fadd float %a6 , %b6
+ %c7 = fadd float %a7 , %b7
+ %c8 = fadd float %a8 , %b8
+ %c9 = fadd float %a9 , %b9
+ %c10 = fadd float %a10, %b10
+ %c11 = fadd float %a11, %b11
+ %c12 = fadd float %a12, %b12
+ %c13 = fadd float %a13, %b13
+ %c14 = fadd float %a14, %b14
+ %c15 = fadd float %a15, %b15
+ %r0 = insertelement <16 x float> undef, float %c0 , i32 0
+ %r1 = insertelement <16 x float> %r0 , float %c1 , i32 1
+ %r2 = insertelement <16 x float> %r1 , float %c2 , i32 2
+ %r3 = insertelement <16 x float> %r2 , float %c3 , i32 3
+ %r4 = insertelement <16 x float> %r3 , float %c4 , i32 4
+ %r5 = insertelement <16 x float> %r4 , float %c5 , i32 5
+ %r6 = insertelement <16 x float> %r5 , float %c6 , i32 6
+ %r7 = insertelement <16 x float> %r6 , float %c7 , i32 7
+ %r8 = insertelement <16 x float> %r7 , float %c8 , i32 8
+ %r9 = insertelement <16 x float> %r8 , float %c9 , i32 9
+ %r10 = insertelement <16 x float> %r9 , float %c10, i32 10
+ %r11 = insertelement <16 x float> %r10, float %c11, i32 11
+ %r12 = insertelement <16 x float> %r11, float %c12, i32 12
+ %r13 = insertelement <16 x float> %r12, float %c13, i32 13
+ %r14 = insertelement <16 x float> %r13, float %c14, i32 14
+ %r15 = insertelement <16 x float> %r14, float %c15, i32 15
+ ret <16 x float> %r15
+}
+
+define <16 x float> @buildvector_sub_16f32(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @buildvector_sub_16f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
+; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
+; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
+; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
+; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
+; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
+; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
+; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
+; CHECK-NEXT: ret <16 x float> [[R15]]
+;
+ %a0 = extractelement <16 x float> %a, i32 0
+ %a1 = extractelement <16 x float> %a, i32 1
+ %a2 = extractelement <16 x float> %a, i32 2
+ %a3 = extractelement <16 x float> %a, i32 3
+ %a4 = extractelement <16 x float> %a, i32 4
+ %a5 = extractelement <16 x float> %a, i32 5
+ %a6 = extractelement <16 x float> %a, i32 6
+ %a7 = extractelement <16 x float> %a, i32 7
+ %a8 = extractelement <16 x float> %a, i32 8
+ %a9 = extractelement <16 x float> %a, i32 9
+ %a10 = extractelement <16 x float> %a, i32 10
+ %a11 = extractelement <16 x float> %a, i32 11
+ %a12 = extractelement <16 x float> %a, i32 12
+ %a13 = extractelement <16 x float> %a, i32 13
+ %a14 = extractelement <16 x float> %a, i32 14
+ %a15 = extractelement <16 x float> %a, i32 15
+ %b0 = extractelement <16 x float> %b, i32 0
+ %b1 = extractelement <16 x float> %b, i32 1
+ %b2 = extractelement <16 x float> %b, i32 2
+ %b3 = extractelement <16 x float> %b, i32 3
+ %b4 = extractelement <16 x float> %b, i32 4
+ %b5 = extractelement <16 x float> %b, i32 5
+ %b6 = extractelement <16 x float> %b, i32 6
+ %b7 = extractelement <16 x float> %b, i32 7
+ %b8 = extractelement <16 x float> %b, i32 8
+ %b9 = extractelement <16 x float> %b, i32 9
+ %b10 = extractelement <16 x float> %b, i32 10
+ %b11 = extractelement <16 x float> %b, i32 11
+ %b12 = extractelement <16 x float> %b, i32 12
+ %b13 = extractelement <16 x float> %b, i32 13
+ %b14 = extractelement <16 x float> %b, i32 14
+ %b15 = extractelement <16 x float> %b, i32 15
+ %c0 = fsub float %a0 , %b0
+ %c1 = fsub float %a1 , %b1
+ %c2 = fsub float %a2 , %b2
+ %c3 = fsub float %a3 , %b3
+ %c4 = fsub float %a4 , %b4
+ %c5 = fsub float %a5 , %b5
+ %c6 = fsub float %a6 , %b6
+ %c7 = fsub float %a7 , %b7
+ %c8 = fsub float %a8 , %b8
+ %c9 = fsub float %a9 , %b9
+ %c10 = fsub float %a10, %b10
+ %c11 = fsub float %a11, %b11
+ %c12 = fsub float %a12, %b12
+ %c13 = fsub float %a13, %b13
+ %c14 = fsub float %a14, %b14
+ %c15 = fsub float %a15, %b15
+ %r0 = insertelement <16 x float> undef, float %c0 , i32 0
+ %r1 = insertelement <16 x float> %r0 , float %c1 , i32 1
+ %r2 = insertelement <16 x float> %r1 , float %c2 , i32 2
+ %r3 = insertelement <16 x float> %r2 , float %c3 , i32 3
+ %r4 = insertelement <16 x float> %r3 , float %c4 , i32 4
+ %r5 = insertelement <16 x float> %r4 , float %c5 , i32 5
+ %r6 = insertelement <16 x float> %r5 , float %c6 , i32 6
+ %r7 = insertelement <16 x float> %r6 , float %c7 , i32 7
+ %r8 = insertelement <16 x float> %r7 , float %c8 , i32 8
+ %r9 = insertelement <16 x float> %r8 , float %c9 , i32 9
+ %r10 = insertelement <16 x float> %r9 , float %c10, i32 10
+ %r11 = insertelement <16 x float> %r10, float %c11, i32 11
+ %r12 = insertelement <16 x float> %r11, float %c12, i32 12
+ %r13 = insertelement <16 x float> %r12, float %c13, i32 13
+ %r14 = insertelement <16 x float> %r13, float %c14, i32 14
+ %r15 = insertelement <16 x float> %r14, float %c15, i32 15
+ ret <16 x float> %r15
+}
+
+define <16 x float> @buildvector_mul_16f32(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @buildvector_mul_16f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
+; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
+; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
+; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
+; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
+; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
+; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
+; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
+; CHECK-NEXT: ret <16 x float> [[R15]]
+;
+ %a0 = extractelement <16 x float> %a, i32 0
+ %a1 = extractelement <16 x float> %a, i32 1
+ %a2 = extractelement <16 x float> %a, i32 2
+ %a3 = extractelement <16 x float> %a, i32 3
+ %a4 = extractelement <16 x float> %a, i32 4
+ %a5 = extractelement <16 x float> %a, i32 5
+ %a6 = extractelement <16 x float> %a, i32 6
+ %a7 = extractelement <16 x float> %a, i32 7
+ %a8 = extractelement <16 x float> %a, i32 8
+ %a9 = extractelement <16 x float> %a, i32 9
+ %a10 = extractelement <16 x float> %a, i32 10
+ %a11 = extractelement <16 x float> %a, i32 11
+ %a12 = extractelement <16 x float> %a, i32 12
+ %a13 = extractelement <16 x float> %a, i32 13
+ %a14 = extractelement <16 x float> %a, i32 14
+ %a15 = extractelement <16 x float> %a, i32 15
+ %b0 = extractelement <16 x float> %b, i32 0
+ %b1 = extractelement <16 x float> %b, i32 1
+ %b2 = extractelement <16 x float> %b, i32 2
+ %b3 = extractelement <16 x float> %b, i32 3
+ %b4 = extractelement <16 x float> %b, i32 4
+ %b5 = extractelement <16 x float> %b, i32 5
+ %b6 = extractelement <16 x float> %b, i32 6
+ %b7 = extractelement <16 x float> %b, i32 7
+ %b8 = extractelement <16 x float> %b, i32 8
+ %b9 = extractelement <16 x float> %b, i32 9
+ %b10 = extractelement <16 x float> %b, i32 10
+ %b11 = extractelement <16 x float> %b, i32 11
+ %b12 = extractelement <16 x float> %b, i32 12
+ %b13 = extractelement <16 x float> %b, i32 13
+ %b14 = extractelement <16 x float> %b, i32 14
+ %b15 = extractelement <16 x float> %b, i32 15
+ %c0 = fmul float %a0 , %b0
+ %c1 = fmul float %a1 , %b1
+ %c2 = fmul float %a2 , %b2
+ %c3 = fmul float %a3 , %b3
+ %c4 = fmul float %a4 , %b4
+ %c5 = fmul float %a5 , %b5
+ %c6 = fmul float %a6 , %b6
+ %c7 = fmul float %a7 , %b7
+ %c8 = fmul float %a8 , %b8
+ %c9 = fmul float %a9 , %b9
+ %c10 = fmul float %a10, %b10
+ %c11 = fmul float %a11, %b11
+ %c12 = fmul float %a12, %b12
+ %c13 = fmul float %a13, %b13
+ %c14 = fmul float %a14, %b14
+ %c15 = fmul float %a15, %b15
+ %r0 = insertelement <16 x float> undef, float %c0 , i32 0
+ %r1 = insertelement <16 x float> %r0 , float %c1 , i32 1
+ %r2 = insertelement <16 x float> %r1 , float %c2 , i32 2
+ %r3 = insertelement <16 x float> %r2 , float %c3 , i32 3
+ %r4 = insertelement <16 x float> %r3 , float %c4 , i32 4
+ %r5 = insertelement <16 x float> %r4 , float %c5 , i32 5
+ %r6 = insertelement <16 x float> %r5 , float %c6 , i32 6
+ %r7 = insertelement <16 x float> %r6 , float %c7 , i32 7
+ %r8 = insertelement <16 x float> %r7 , float %c8 , i32 8
+ %r9 = insertelement <16 x float> %r8 , float %c9 , i32 9
+ %r10 = insertelement <16 x float> %r9 , float %c10, i32 10
+ %r11 = insertelement <16 x float> %r10, float %c11, i32 11
+ %r12 = insertelement <16 x float> %r11, float %c12, i32 12
+ %r13 = insertelement <16 x float> %r12, float %c13, i32 13
+ %r14 = insertelement <16 x float> %r13, float %c14, i32 14
+ %r15 = insertelement <16 x float> %r14, float %c15, i32 15
+ ret <16 x float> %r15
+}
+
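+; Same buildvector pattern with fdiv: the sixteen scalar divides should be
+; replaced by one <16 x float> fdiv (see the autogenerated CHECK lines).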
+define <16 x float> @buildvector_div_16f32(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: @buildvector_div_16f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fdiv <16 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[R0:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[R1:%.*]] = insertelement <16 x float> [[R0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[R2:%.*]] = insertelement <16 x float> [[R1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[R3:%.*]] = insertelement <16 x float> [[R2]], float [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x float> [[TMP1]], i32 4
+; CHECK-NEXT: [[R4:%.*]] = insertelement <16 x float> [[R3]], float [[TMP6]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x float> [[TMP1]], i32 5
+; CHECK-NEXT: [[R5:%.*]] = insertelement <16 x float> [[R4]], float [[TMP7]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[TMP1]], i32 6
+; CHECK-NEXT: [[R6:%.*]] = insertelement <16 x float> [[R5]], float [[TMP8]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x float> [[TMP1]], i32 7
+; CHECK-NEXT: [[R7:%.*]] = insertelement <16 x float> [[R6]], float [[TMP9]], i32 7
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[TMP1]], i32 8
+; CHECK-NEXT: [[R8:%.*]] = insertelement <16 x float> [[R7]], float [[TMP10]], i32 8
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x float> [[TMP1]], i32 9
+; CHECK-NEXT: [[R9:%.*]] = insertelement <16 x float> [[R8]], float [[TMP11]], i32 9
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x float> [[TMP1]], i32 10
+; CHECK-NEXT: [[R10:%.*]] = insertelement <16 x float> [[R9]], float [[TMP12]], i32 10
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x float> [[TMP1]], i32 11
+; CHECK-NEXT: [[R11:%.*]] = insertelement <16 x float> [[R10]], float [[TMP13]], i32 11
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x float> [[TMP1]], i32 12
+; CHECK-NEXT: [[R12:%.*]] = insertelement <16 x float> [[R11]], float [[TMP14]], i32 12
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x float> [[TMP1]], i32 13
+; CHECK-NEXT: [[R13:%.*]] = insertelement <16 x float> [[R12]], float [[TMP15]], i32 13
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x float> [[TMP1]], i32 14
+; CHECK-NEXT: [[R14:%.*]] = insertelement <16 x float> [[R13]], float [[TMP16]], i32 14
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x float> [[TMP1]], i32 15
+; CHECK-NEXT: [[R15:%.*]] = insertelement <16 x float> [[R14]], float [[TMP17]], i32 15
+; CHECK-NEXT: ret <16 x float> [[R15]]
+;
+ %a0 = extractelement <16 x float> %a, i32 0
+ %a1 = extractelement <16 x float> %a, i32 1
+ %a2 = extractelement <16 x float> %a, i32 2
+ %a3 = extractelement <16 x float> %a, i32 3
+ %a4 = extractelement <16 x float> %a, i32 4
+ %a5 = extractelement <16 x float> %a, i32 5
+ %a6 = extractelement <16 x float> %a, i32 6
+ %a7 = extractelement <16 x float> %a, i32 7
+ %a8 = extractelement <16 x float> %a, i32 8
+ %a9 = extractelement <16 x float> %a, i32 9
+ %a10 = extractelement <16 x float> %a, i32 10
+ %a11 = extractelement <16 x float> %a, i32 11
+ %a12 = extractelement <16 x float> %a, i32 12
+ %a13 = extractelement <16 x float> %a, i32 13
+ %a14 = extractelement <16 x float> %a, i32 14
+ %a15 = extractelement <16 x float> %a, i32 15
+ %b0 = extractelement <16 x float> %b, i32 0
+ %b1 = extractelement <16 x float> %b, i32 1
+ %b2 = extractelement <16 x float> %b, i32 2
+ %b3 = extractelement <16 x float> %b, i32 3
+ %b4 = extractelement <16 x float> %b, i32 4
+ %b5 = extractelement <16 x float> %b, i32 5
+ %b6 = extractelement <16 x float> %b, i32 6
+ %b7 = extractelement <16 x float> %b, i32 7
+ %b8 = extractelement <16 x float> %b, i32 8
+ %b9 = extractelement <16 x float> %b, i32 9
+ %b10 = extractelement <16 x float> %b, i32 10
+ %b11 = extractelement <16 x float> %b, i32 11
+ %b12 = extractelement <16 x float> %b, i32 12
+ %b13 = extractelement <16 x float> %b, i32 13
+ %b14 = extractelement <16 x float> %b, i32 14
+ %b15 = extractelement <16 x float> %b, i32 15
+ %c0 = fdiv float %a0 , %b0
+ %c1 = fdiv float %a1 , %b1
+ %c2 = fdiv float %a2 , %b2
+ %c3 = fdiv float %a3 , %b3
+ %c4 = fdiv float %a4 , %b4
+ %c5 = fdiv float %a5 , %b5
+ %c6 = fdiv float %a6 , %b6
+ %c7 = fdiv float %a7 , %b7
+ %c8 = fdiv float %a8 , %b8
+ %c9 = fdiv float %a9 , %b9
+ %c10 = fdiv float %a10, %b10
+ %c11 = fdiv float %a11, %b11
+ %c12 = fdiv float %a12, %b12
+ %c13 = fdiv float %a13, %b13
+ %c14 = fdiv float %a14, %b14
+ %c15 = fdiv float %a15, %b15
+ %r0 = insertelement <16 x float> undef, float %c0 , i32 0
+ %r1 = insertelement <16 x float> %r0 , float %c1 , i32 1
+ %r2 = insertelement <16 x float> %r1 , float %c2 , i32 2
+ %r3 = insertelement <16 x float> %r2 , float %c3 , i32 3
+ %r4 = insertelement <16 x float> %r3 , float %c4 , i32 4
+ %r5 = insertelement <16 x float> %r4 , float %c5 , i32 5
+ %r6 = insertelement <16 x float> %r5 , float %c6 , i32 6
+ %r7 = insertelement <16 x float> %r6 , float %c7 , i32 7
+ %r8 = insertelement <16 x float> %r7 , float %c8 , i32 8
+ %r9 = insertelement <16 x float> %r8 , float %c9 , i32 9
+ %r10 = insertelement <16 x float> %r9 , float %c10, i32 10
+ %r11 = insertelement <16 x float> %r10, float %c11, i32 11
+ %r12 = insertelement <16 x float> %r11, float %c12, i32 12
+ %r13 = insertelement <16 x float> %r12, float %c13, i32 13
+ %r14 = insertelement <16 x float> %r13, float %c14, i32 14
+ %r15 = insertelement <16 x float> %r14, float %c15, i32 15
+ ret <16 x float> %r15
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll
new file mode 100644
index 00000000000..6aa7010bf2b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-smulo.ll
@@ -0,0 +1,1254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
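+; Tests SLP vectorization of llvm.smul.with.overflow.* calls. The intrinsic
+; returns a { result, overflow-flag } aggregate, and the SLP vectorizer does
+; not currently treat it as vectorizable, so the common CHECK prefix below
+; expects the loads, calls and stores to stay scalar on every target.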
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32)
+declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16)
+declare {i8 , i1} @llvm.smul.with.overflow.i8 (i8 , i8 )
+
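+; 8 x i64 smul-with-overflow: expected to remain fully scalar.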
+define void @mul_v8i64() {
+; CHECK-LABEL: @mul_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %c0 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a0, i64 %b0)
+ %c1 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a1, i64 %b1)
+ %c2 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a2, i64 %b2)
+ %c3 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a3, i64 %b3)
+ %c4 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a4, i64 %b4)
+ %c5 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a5, i64 %b5)
+ %c6 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a6, i64 %b6)
+ %c7 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %a7, i64 %b7)
+ %r0 = extractvalue {i64, i1} %c0, 0
+ %r1 = extractvalue {i64, i1} %c1, 0
+ %r2 = extractvalue {i64, i1} %c2, 0
+ %r3 = extractvalue {i64, i1} %c3, 0
+ %r4 = extractvalue {i64, i1} %c4, 0
+ %r5 = extractvalue {i64, i1} %c5, 0
+ %r6 = extractvalue {i64, i1} %c6, 0
+ %r7 = extractvalue {i64, i1} %c7, 0
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
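+; 16 x i32 smul-with-overflow: expected to remain fully scalar.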
+define void @mul_v16i32() {
+; CHECK-LABEL: @mul_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %c0 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a0 , i32 %b0 )
+ %c1 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a1 , i32 %b1 )
+ %c2 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a2 , i32 %b2 )
+ %c3 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a3 , i32 %b3 )
+ %c4 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a4 , i32 %b4 )
+ %c5 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a5 , i32 %b5 )
+ %c6 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a6 , i32 %b6 )
+ %c7 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a7 , i32 %b7 )
+ %c8 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a8 , i32 %b8 )
+ %c9 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a9 , i32 %b9 )
+ %c10 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a10, i32 %b10)
+ %c11 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a11, i32 %b11)
+ %c12 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a12, i32 %b12)
+ %c13 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a13, i32 %b13)
+ %c14 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a14, i32 %b14)
+ %c15 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %a15, i32 %b15)
+ %r0 = extractvalue {i32, i1} %c0 , 0
+ %r1 = extractvalue {i32, i1} %c1 , 0
+ %r2 = extractvalue {i32, i1} %c2 , 0
+ %r3 = extractvalue {i32, i1} %c3 , 0
+ %r4 = extractvalue {i32, i1} %c4 , 0
+ %r5 = extractvalue {i32, i1} %c5 , 0
+ %r6 = extractvalue {i32, i1} %c6 , 0
+ %r7 = extractvalue {i32, i1} %c7 , 0
+ %r8 = extractvalue {i32, i1} %c8 , 0
+ %r9 = extractvalue {i32, i1} %c9 , 0
+ %r10 = extractvalue {i32, i1} %c10, 0
+ %r11 = extractvalue {i32, i1} %c11, 0
+ %r12 = extractvalue {i32, i1} %c12, 0
+ %r13 = extractvalue {i32, i1} %c13, 0
+ %r14 = extractvalue {i32, i1} %c14, 0
+ %r15 = extractvalue {i32, i1} %c15, 0
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
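+; 32 x i16 smul-with-overflow: expected to remain fully scalar.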
+define void @mul_v32i16() {
+; CHECK-LABEL: @mul_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; CHECK-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %c0 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a0 , i16 %b0 )
+ %c1 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a1 , i16 %b1 )
+ %c2 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a2 , i16 %b2 )
+ %c3 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a3 , i16 %b3 )
+ %c4 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a4 , i16 %b4 )
+ %c5 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a5 , i16 %b5 )
+ %c6 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a6 , i16 %b6 )
+ %c7 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a7 , i16 %b7 )
+ %c8 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a8 , i16 %b8 )
+ %c9 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a9 , i16 %b9 )
+ %c10 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a10, i16 %b10)
+ %c11 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a11, i16 %b11)
+ %c12 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a12, i16 %b12)
+ %c13 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a13, i16 %b13)
+ %c14 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a14, i16 %b14)
+ %c15 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a15, i16 %b15)
+ %c16 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a16, i16 %b16)
+ %c17 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a17, i16 %b17)
+ %c18 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a18, i16 %b18)
+ %c19 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a19, i16 %b19)
+ %c20 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a20, i16 %b20)
+ %c21 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a21, i16 %b21)
+ %c22 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a22, i16 %b22)
+ %c23 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a23, i16 %b23)
+ %c24 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a24, i16 %b24)
+ %c25 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a25, i16 %b25)
+ %c26 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a26, i16 %b26)
+ %c27 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a27, i16 %b27)
+ %c28 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a28, i16 %b28)
+ %c29 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a29, i16 %b29)
+ %c30 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a30, i16 %b30)
+ %c31 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %a31, i16 %b31)
+ %r0 = extractvalue {i16, i1} %c0 , 0
+ %r1 = extractvalue {i16, i1} %c1 , 0
+ %r2 = extractvalue {i16, i1} %c2 , 0
+ %r3 = extractvalue {i16, i1} %c3 , 0
+ %r4 = extractvalue {i16, i1} %c4 , 0
+ %r5 = extractvalue {i16, i1} %c5 , 0
+ %r6 = extractvalue {i16, i1} %c6 , 0
+ %r7 = extractvalue {i16, i1} %c7 , 0
+ %r8 = extractvalue {i16, i1} %c8 , 0
+ %r9 = extractvalue {i16, i1} %c9 , 0
+ %r10 = extractvalue {i16, i1} %c10, 0
+ %r11 = extractvalue {i16, i1} %c11, 0
+ %r12 = extractvalue {i16, i1} %c12, 0
+ %r13 = extractvalue {i16, i1} %c13, 0
+ %r14 = extractvalue {i16, i1} %c14, 0
+ %r15 = extractvalue {i16, i1} %c15, 0
+ %r16 = extractvalue {i16, i1} %c16, 0
+ %r17 = extractvalue {i16, i1} %c17, 0
+ %r18 = extractvalue {i16, i1} %c18, 0
+ %r19 = extractvalue {i16, i1} %c19, 0
+ %r20 = extractvalue {i16, i1} %c20, 0
+ %r21 = extractvalue {i16, i1} %c21, 0
+ %r22 = extractvalue {i16, i1} %c22, 0
+ %r23 = extractvalue {i16, i1} %c23, 0
+ %r24 = extractvalue {i16, i1} %c24, 0
+ %r25 = extractvalue {i16, i1} %c25, 0
+ %r26 = extractvalue {i16, i1} %c26, 0
+ %r27 = extractvalue {i16, i1} %c27, 0
+ %r28 = extractvalue {i16, i1} %c28, 0
+ %r29 = extractvalue {i16, i1} %c29, 0
+ %r30 = extractvalue {i16, i1} %c30, 0
+ %r31 = extractvalue {i16, i1} %c31, 0
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
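+; As the autogenerated CHECK lines below show, the 64 x i8 overflow-multiply
+; pattern currently stays fully scalar: every load, @llvm.smul.with.overflow.i8
+; call, extractvalue and store survives SLP unvectorized.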
+define void @mul_v64i8() {
+; CHECK-LABEL: @mul_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[R1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %c0 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a0 , i8 %b0 )
+ %c1 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a1 , i8 %b1 )
+ %c2 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a2 , i8 %b2 )
+ %c3 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a3 , i8 %b3 )
+ %c4 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a4 , i8 %b4 )
+ %c5 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a5 , i8 %b5 )
+ %c6 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a6 , i8 %b6 )
+ %c7 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a7 , i8 %b7 )
+ %c8 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a8 , i8 %b8 )
+ %c9 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a9 , i8 %b9 )
+ %c10 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a10, i8 %b10)
+ %c11 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a11, i8 %b11)
+ %c12 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a12, i8 %b12)
+ %c13 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a13, i8 %b13)
+ %c14 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a14, i8 %b14)
+ %c15 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a15, i8 %b15)
+ %c16 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a16, i8 %b16)
+ %c17 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a17, i8 %b17)
+ %c18 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a18, i8 %b18)
+ %c19 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a19, i8 %b19)
+ %c20 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a20, i8 %b20)
+ %c21 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a21, i8 %b21)
+ %c22 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a22, i8 %b22)
+ %c23 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a23, i8 %b23)
+ %c24 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a24, i8 %b24)
+ %c25 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a25, i8 %b25)
+ %c26 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a26, i8 %b26)
+ %c27 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a27, i8 %b27)
+ %c28 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a28, i8 %b28)
+ %c29 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a29, i8 %b29)
+ %c30 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a30, i8 %b30)
+ %c31 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a31, i8 %b31)
+ %c32 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a32, i8 %b32)
+ %c33 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a33, i8 %b33)
+ %c34 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a34, i8 %b34)
+ %c35 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a35, i8 %b35)
+ %c36 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a36, i8 %b36)
+ %c37 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a37, i8 %b37)
+ %c38 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a38, i8 %b38)
+ %c39 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a39, i8 %b39)
+ %c40 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a40, i8 %b40)
+ %c41 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a41, i8 %b41)
+ %c42 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a42, i8 %b42)
+ %c43 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a43, i8 %b43)
+ %c44 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a44, i8 %b44)
+ %c45 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a45, i8 %b45)
+ %c46 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a46, i8 %b46)
+ %c47 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a47, i8 %b47)
+ %c48 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a48, i8 %b48)
+ %c49 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a49, i8 %b49)
+ %c50 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a50, i8 %b50)
+ %c51 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a51, i8 %b51)
+ %c52 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a52, i8 %b52)
+ %c53 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a53, i8 %b53)
+ %c54 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a54, i8 %b54)
+ %c55 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a55, i8 %b55)
+ %c56 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a56, i8 %b56)
+ %c57 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a57, i8 %b57)
+ %c58 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a58, i8 %b58)
+ %c59 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a59, i8 %b59)
+ %c60 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a60, i8 %b60)
+ %c61 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a61, i8 %b61)
+ %c62 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a62, i8 %b62)
+ %c63 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %a63, i8 %b63)
+ %r0 = extractvalue {i8, i1} %c0 , 0
+ %r1 = extractvalue {i8, i1} %c1 , 0
+ %r2 = extractvalue {i8, i1} %c2 , 0
+ %r3 = extractvalue {i8, i1} %c3 , 0
+ %r4 = extractvalue {i8, i1} %c4 , 0
+ %r5 = extractvalue {i8, i1} %c5 , 0
+ %r6 = extractvalue {i8, i1} %c6 , 0
+ %r7 = extractvalue {i8, i1} %c7 , 0
+ %r8 = extractvalue {i8, i1} %c8 , 0
+ %r9 = extractvalue {i8, i1} %c9 , 0
+ %r10 = extractvalue {i8, i1} %c10, 0
+ %r11 = extractvalue {i8, i1} %c11, 0
+ %r12 = extractvalue {i8, i1} %c12, 0
+ %r13 = extractvalue {i8, i1} %c13, 0
+ %r14 = extractvalue {i8, i1} %c14, 0
+ %r15 = extractvalue {i8, i1} %c15, 0
+ %r16 = extractvalue {i8, i1} %c16, 0
+ %r17 = extractvalue {i8, i1} %c17, 0
+ %r18 = extractvalue {i8, i1} %c18, 0
+ %r19 = extractvalue {i8, i1} %c19, 0
+ %r20 = extractvalue {i8, i1} %c20, 0
+ %r21 = extractvalue {i8, i1} %c21, 0
+ %r22 = extractvalue {i8, i1} %c22, 0
+ %r23 = extractvalue {i8, i1} %c23, 0
+ %r24 = extractvalue {i8, i1} %c24, 0
+ %r25 = extractvalue {i8, i1} %c25, 0
+ %r26 = extractvalue {i8, i1} %c26, 0
+ %r27 = extractvalue {i8, i1} %c27, 0
+ %r28 = extractvalue {i8, i1} %c28, 0
+ %r29 = extractvalue {i8, i1} %c29, 0
+ %r30 = extractvalue {i8, i1} %c30, 0
+ %r31 = extractvalue {i8, i1} %c31, 0
+ %r32 = extractvalue {i8, i1} %c32, 0
+ %r33 = extractvalue {i8, i1} %c33, 0
+ %r34 = extractvalue {i8, i1} %c34, 0
+ %r35 = extractvalue {i8, i1} %c35, 0
+ %r36 = extractvalue {i8, i1} %c36, 0
+ %r37 = extractvalue {i8, i1} %c37, 0
+ %r38 = extractvalue {i8, i1} %c38, 0
+ %r39 = extractvalue {i8, i1} %c39, 0
+ %r40 = extractvalue {i8, i1} %c40, 0
+ %r41 = extractvalue {i8, i1} %c41, 0
+ %r42 = extractvalue {i8, i1} %c42, 0
+ %r43 = extractvalue {i8, i1} %c43, 0
+ %r44 = extractvalue {i8, i1} %c44, 0
+ %r45 = extractvalue {i8, i1} %c45, 0
+ %r46 = extractvalue {i8, i1} %c46, 0
+ %r47 = extractvalue {i8, i1} %c47, 0
+ %r48 = extractvalue {i8, i1} %c48, 0
+ %r49 = extractvalue {i8, i1} %c49, 0
+ %r50 = extractvalue {i8, i1} %c50, 0
+ %r51 = extractvalue {i8, i1} %c51, 0
+ %r52 = extractvalue {i8, i1} %c52, 0
+ %r53 = extractvalue {i8, i1} %c53, 0
+ %r54 = extractvalue {i8, i1} %c54, 0
+ %r55 = extractvalue {i8, i1} %c55, 0
+ %r56 = extractvalue {i8, i1} %c56, 0
+ %r57 = extractvalue {i8, i1} %c57, 0
+ %r58 = extractvalue {i8, i1} %c58, 0
+ %r59 = extractvalue {i8, i1} %c59, 0
+ %r60 = extractvalue {i8, i1} %c60, 0
+ %r61 = extractvalue {i8, i1} %c61, 0
+ %r62 = extractvalue {i8, i1} %c62, 0
+ %r63 = extractvalue {i8, i1} %c63, 0
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll
new file mode 100644
index 00000000000..d9f9fae3e74
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul-umulo.ll
@@ -0,0 +1,1254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32)
+declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16)
+declare {i8 , i1} @llvm.umul.with.overflow.i8 (i8 , i8 )
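+; A minimal illustrative sketch, not exercised by this test: the overflow
+; intrinsics are overloaded and also accept vector-of-integer types, so an
+; SLP-vectorized form of the scalar calls below could in principle target a
+; declaration such as the following. Whether the vectorizer actually emits
+; it depends on the target cost model; the CHECK lines in this file show
+; the calls remaining scalar for every RUN configuration above.
+declare {<8 x i64>, <8 x i1>} @llvm.umul.with.overflow.v8i64(<8 x i64>, <8 x i64>)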
+
+define void @mul_v8i64() {
+; CHECK-LABEL: @mul_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %c0 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %a0, i64 %b0)
+ %c1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %a1, i64 %b1)
+ %c2 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %a2, i64 %b2)
+ %c3 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %a3, i64 %b3)
+ %c4 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %a4, i64 %b4)
+ %c5 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %a5, i64 %b5)
+ %c6 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %a6, i64 %b6)
+ %c7 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %a7, i64 %b7)
+ %r0 = extractvalue {i64, i1} %c0, 0
+ %r1 = extractvalue {i64, i1} %c1, 0
+ %r2 = extractvalue {i64, i1} %c2, 0
+ %r3 = extractvalue {i64, i1} %c3, 0
+ %r4 = extractvalue {i64, i1} %c4, 0
+ %r5 = extractvalue {i64, i1} %c5, 0
+ %r6 = extractvalue {i64, i1} %c6, 0
+ %r7 = extractvalue {i64, i1} %c7, 0
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
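+; For context: the load/call/extractvalue/store pattern above matches what
+; clang typically emits for unsigned __builtin_mul_overflow on 64-bit
+; operands, e.g. (hypothetical C source, not part of this test):
+;   unsigned long long r;
+;   int ovf = __builtin_mul_overflow(a64[i], b64[i], &r);
+; Each function in this file keeps only the multiplication result
+; (extractvalue index 0) and discards the i1 overflow flag.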
+
+define void @mul_v16i32() {
+; CHECK-LABEL: @mul_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %c0 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a0 , i32 %b0 )
+ %c1 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a1 , i32 %b1 )
+ %c2 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a2 , i32 %b2 )
+ %c3 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a3 , i32 %b3 )
+ %c4 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a4 , i32 %b4 )
+ %c5 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a5 , i32 %b5 )
+ %c6 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a6 , i32 %b6 )
+ %c7 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a7 , i32 %b7 )
+ %c8 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a8 , i32 %b8 )
+ %c9 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a9 , i32 %b9 )
+ %c10 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a10, i32 %b10)
+ %c11 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a11, i32 %b11)
+ %c12 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a12, i32 %b12)
+ %c13 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a13, i32 %b13)
+ %c14 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a14, i32 %b14)
+ %c15 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %a15, i32 %b15)
+ %r0 = extractvalue {i32, i1} %c0 , 0
+ %r1 = extractvalue {i32, i1} %c1 , 0
+ %r2 = extractvalue {i32, i1} %c2 , 0
+ %r3 = extractvalue {i32, i1} %c3 , 0
+ %r4 = extractvalue {i32, i1} %c4 , 0
+ %r5 = extractvalue {i32, i1} %c5 , 0
+ %r6 = extractvalue {i32, i1} %c6 , 0
+ %r7 = extractvalue {i32, i1} %c7 , 0
+ %r8 = extractvalue {i32, i1} %c8 , 0
+ %r9 = extractvalue {i32, i1} %c9 , 0
+ %r10 = extractvalue {i32, i1} %c10, 0
+ %r11 = extractvalue {i32, i1} %c11, 0
+ %r12 = extractvalue {i32, i1} %c12, 0
+ %r13 = extractvalue {i32, i1} %c13, 0
+ %r14 = extractvalue {i32, i1} %c14, 0
+ %r15 = extractvalue {i32, i1} %c15, 0
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @mul_v32i16() {
+; CHECK-LABEL: @mul_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; CHECK-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %c0 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a0 , i16 %b0 )
+ %c1 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a1 , i16 %b1 )
+ %c2 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a2 , i16 %b2 )
+ %c3 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a3 , i16 %b3 )
+ %c4 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a4 , i16 %b4 )
+ %c5 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a5 , i16 %b5 )
+ %c6 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a6 , i16 %b6 )
+ %c7 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a7 , i16 %b7 )
+ %c8 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a8 , i16 %b8 )
+ %c9 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a9 , i16 %b9 )
+ %c10 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a10, i16 %b10)
+ %c11 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a11, i16 %b11)
+ %c12 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a12, i16 %b12)
+ %c13 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a13, i16 %b13)
+ %c14 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a14, i16 %b14)
+ %c15 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a15, i16 %b15)
+ %c16 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a16, i16 %b16)
+ %c17 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a17, i16 %b17)
+ %c18 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a18, i16 %b18)
+ %c19 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a19, i16 %b19)
+ %c20 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a20, i16 %b20)
+ %c21 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a21, i16 %b21)
+ %c22 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a22, i16 %b22)
+ %c23 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a23, i16 %b23)
+ %c24 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a24, i16 %b24)
+ %c25 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a25, i16 %b25)
+ %c26 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a26, i16 %b26)
+ %c27 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a27, i16 %b27)
+ %c28 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a28, i16 %b28)
+ %c29 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a29, i16 %b29)
+ %c30 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a30, i16 %b30)
+ %c31 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %a31, i16 %b31)
+ %r0 = extractvalue {i16, i1} %c0 , 0
+ %r1 = extractvalue {i16, i1} %c1 , 0
+ %r2 = extractvalue {i16, i1} %c2 , 0
+ %r3 = extractvalue {i16, i1} %c3 , 0
+ %r4 = extractvalue {i16, i1} %c4 , 0
+ %r5 = extractvalue {i16, i1} %c5 , 0
+ %r6 = extractvalue {i16, i1} %c6 , 0
+ %r7 = extractvalue {i16, i1} %c7 , 0
+ %r8 = extractvalue {i16, i1} %c8 , 0
+ %r9 = extractvalue {i16, i1} %c9 , 0
+ %r10 = extractvalue {i16, i1} %c10, 0
+ %r11 = extractvalue {i16, i1} %c11, 0
+ %r12 = extractvalue {i16, i1} %c12, 0
+ %r13 = extractvalue {i16, i1} %c13, 0
+ %r14 = extractvalue {i16, i1} %c14, 0
+ %r15 = extractvalue {i16, i1} %c15, 0
+ %r16 = extractvalue {i16, i1} %c16, 0
+ %r17 = extractvalue {i16, i1} %c17, 0
+ %r18 = extractvalue {i16, i1} %c18, 0
+ %r19 = extractvalue {i16, i1} %c19, 0
+ %r20 = extractvalue {i16, i1} %c20, 0
+ %r21 = extractvalue {i16, i1} %c21, 0
+ %r22 = extractvalue {i16, i1} %c22, 0
+ %r23 = extractvalue {i16, i1} %c23, 0
+ %r24 = extractvalue {i16, i1} %c24, 0
+ %r25 = extractvalue {i16, i1} %c25, 0
+ %r26 = extractvalue {i16, i1} %c26, 0
+ %r27 = extractvalue {i16, i1} %c27, 0
+ %r28 = extractvalue {i16, i1} %c28, 0
+ %r29 = extractvalue {i16, i1} %c29, 0
+ %r30 = extractvalue {i16, i1} %c30, 0
+ %r31 = extractvalue {i16, i1} %c31, 0
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
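+; A brief descriptive comment (not in the original test): @mul_v64i8 repeats
+; the umul.with.overflow pattern above at i8 element width. The all-scalar
+; CHECK lines below indicate SLP leaves these overflow-intrinsic calls
+; unvectorized in this configuration.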
+define void @mul_v64i8() {
+; CHECK-LABEL: @mul_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[R1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %c0 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a0 , i8 %b0 )
+ %c1 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a1 , i8 %b1 )
+ %c2 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a2 , i8 %b2 )
+ %c3 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a3 , i8 %b3 )
+ %c4 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a4 , i8 %b4 )
+ %c5 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a5 , i8 %b5 )
+ %c6 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a6 , i8 %b6 )
+ %c7 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a7 , i8 %b7 )
+ %c8 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a8 , i8 %b8 )
+ %c9 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a9 , i8 %b9 )
+ %c10 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a10, i8 %b10)
+ %c11 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a11, i8 %b11)
+ %c12 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a12, i8 %b12)
+ %c13 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a13, i8 %b13)
+ %c14 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a14, i8 %b14)
+ %c15 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a15, i8 %b15)
+ %c16 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a16, i8 %b16)
+ %c17 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a17, i8 %b17)
+ %c18 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a18, i8 %b18)
+ %c19 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a19, i8 %b19)
+ %c20 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a20, i8 %b20)
+ %c21 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a21, i8 %b21)
+ %c22 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a22, i8 %b22)
+ %c23 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a23, i8 %b23)
+ %c24 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a24, i8 %b24)
+ %c25 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a25, i8 %b25)
+ %c26 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a26, i8 %b26)
+ %c27 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a27, i8 %b27)
+ %c28 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a28, i8 %b28)
+ %c29 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a29, i8 %b29)
+ %c30 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a30, i8 %b30)
+ %c31 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a31, i8 %b31)
+ %c32 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a32, i8 %b32)
+ %c33 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a33, i8 %b33)
+ %c34 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a34, i8 %b34)
+ %c35 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a35, i8 %b35)
+ %c36 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a36, i8 %b36)
+ %c37 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a37, i8 %b37)
+ %c38 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a38, i8 %b38)
+ %c39 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a39, i8 %b39)
+ %c40 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a40, i8 %b40)
+ %c41 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a41, i8 %b41)
+ %c42 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a42, i8 %b42)
+ %c43 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a43, i8 %b43)
+ %c44 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a44, i8 %b44)
+ %c45 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a45, i8 %b45)
+ %c46 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a46, i8 %b46)
+ %c47 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a47, i8 %b47)
+ %c48 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a48, i8 %b48)
+ %c49 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a49, i8 %b49)
+ %c50 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a50, i8 %b50)
+ %c51 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a51, i8 %b51)
+ %c52 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a52, i8 %b52)
+ %c53 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a53, i8 %b53)
+ %c54 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a54, i8 %b54)
+ %c55 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a55, i8 %b55)
+ %c56 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a56, i8 %b56)
+ %c57 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a57, i8 %b57)
+ %c58 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a58, i8 %b58)
+ %c59 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a59, i8 %b59)
+ %c60 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a60, i8 %b60)
+ %c61 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a61, i8 %b61)
+ %c62 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a62, i8 %b62)
+ %c63 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %a63, i8 %b63)
+ %r0 = extractvalue {i8, i1} %c0 , 0
+ %r1 = extractvalue {i8, i1} %c1 , 0
+ %r2 = extractvalue {i8, i1} %c2 , 0
+ %r3 = extractvalue {i8, i1} %c3 , 0
+ %r4 = extractvalue {i8, i1} %c4 , 0
+ %r5 = extractvalue {i8, i1} %c5 , 0
+ %r6 = extractvalue {i8, i1} %c6 , 0
+ %r7 = extractvalue {i8, i1} %c7 , 0
+ %r8 = extractvalue {i8, i1} %c8 , 0
+ %r9 = extractvalue {i8, i1} %c9 , 0
+ %r10 = extractvalue {i8, i1} %c10, 0
+ %r11 = extractvalue {i8, i1} %c11, 0
+ %r12 = extractvalue {i8, i1} %c12, 0
+ %r13 = extractvalue {i8, i1} %c13, 0
+ %r14 = extractvalue {i8, i1} %c14, 0
+ %r15 = extractvalue {i8, i1} %c15, 0
+ %r16 = extractvalue {i8, i1} %c16, 0
+ %r17 = extractvalue {i8, i1} %c17, 0
+ %r18 = extractvalue {i8, i1} %c18, 0
+ %r19 = extractvalue {i8, i1} %c19, 0
+ %r20 = extractvalue {i8, i1} %c20, 0
+ %r21 = extractvalue {i8, i1} %c21, 0
+ %r22 = extractvalue {i8, i1} %c22, 0
+ %r23 = extractvalue {i8, i1} %c23, 0
+ %r24 = extractvalue {i8, i1} %c24, 0
+ %r25 = extractvalue {i8, i1} %c25, 0
+ %r26 = extractvalue {i8, i1} %c26, 0
+ %r27 = extractvalue {i8, i1} %c27, 0
+ %r28 = extractvalue {i8, i1} %c28, 0
+ %r29 = extractvalue {i8, i1} %c29, 0
+ %r30 = extractvalue {i8, i1} %c30, 0
+ %r31 = extractvalue {i8, i1} %c31, 0
+ %r32 = extractvalue {i8, i1} %c32, 0
+ %r33 = extractvalue {i8, i1} %c33, 0
+ %r34 = extractvalue {i8, i1} %c34, 0
+ %r35 = extractvalue {i8, i1} %c35, 0
+ %r36 = extractvalue {i8, i1} %c36, 0
+ %r37 = extractvalue {i8, i1} %c37, 0
+ %r38 = extractvalue {i8, i1} %c38, 0
+ %r39 = extractvalue {i8, i1} %c39, 0
+ %r40 = extractvalue {i8, i1} %c40, 0
+ %r41 = extractvalue {i8, i1} %c41, 0
+ %r42 = extractvalue {i8, i1} %c42, 0
+ %r43 = extractvalue {i8, i1} %c43, 0
+ %r44 = extractvalue {i8, i1} %c44, 0
+ %r45 = extractvalue {i8, i1} %c45, 0
+ %r46 = extractvalue {i8, i1} %c46, 0
+ %r47 = extractvalue {i8, i1} %c47, 0
+ %r48 = extractvalue {i8, i1} %c48, 0
+ %r49 = extractvalue {i8, i1} %c49, 0
+ %r50 = extractvalue {i8, i1} %c50, 0
+ %r51 = extractvalue {i8, i1} %c51, 0
+ %r52 = extractvalue {i8, i1} %c52, 0
+ %r53 = extractvalue {i8, i1} %c53, 0
+ %r54 = extractvalue {i8, i1} %c54, 0
+ %r55 = extractvalue {i8, i1} %c55, 0
+ %r56 = extractvalue {i8, i1} %c56, 0
+ %r57 = extractvalue {i8, i1} %c57, 0
+ %r58 = extractvalue {i8, i1} %c58, 0
+ %r59 = extractvalue {i8, i1} %c59, 0
+ %r60 = extractvalue {i8, i1} %c60, 0
+ %r61 = extractvalue {i8, i1} %c61, 0
+ %r62 = extractvalue {i8, i1} %c62, 0
+ %r63 = extractvalue {i8, i1} %c63, 0
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
new file mode 100644
index 00000000000..8a01c85bcdb
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
@@ -0,0 +1,784 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+
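+; Each function below multiplies two of the global arrays element by element at
+; a fixed width (i64/i32/i16/i8), giving the SLP vectorizer a chance to merge
+; the scalar loads, muls and stores into vector operations where profitable.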
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @mul_v8i64() {
+; SSE-LABEL: @mul_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]]
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @mul_v8i64(
+; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SLM-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]]
+; SLM-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]]
+; SLM-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]]
+; SLM-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]]
+; SLM-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]]
+; SLM-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]]
+; SLM-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]]
+; SLM-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]]
+; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SLM-NEXT: ret void
+;
+; AVX1-LABEL: @mul_v8i64(
+; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]]
+; AVX1-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]]
+; AVX1-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]]
+; AVX1-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]]
+; AVX1-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]]
+; AVX1-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]]
+; AVX1-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]]
+; AVX1-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]]
+; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @mul_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @mul_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = mul <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = mul i64 %a0, %b0
+ %r1 = mul i64 %a1, %b1
+ %r2 = mul i64 %a2, %b2
+ %r3 = mul i64 %a3, %b3
+ %r4 = mul i64 %a4, %b4
+ %r5 = mul i64 %a5, %b5
+ %r6 = mul i64 %a6, %b6
+ %r7 = mul i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
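+
+; Note that the checks above expect SSE, SLM and AVX1 to leave the v8i64 case
+; scalar while AVX2 and AVX512 vectorize it, presumably because the cost model
+; only considers wide 64-bit multiplies profitable with 256-bit integer support.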
+
+define void @mul_v16i32() {
+; SSE-LABEL: @mul_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @mul_v16i32(
+; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
+; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
+; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
+; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @mul_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @mul_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = mul i32 %a0 , %b0
+ %r1 = mul i32 %a1 , %b1
+ %r2 = mul i32 %a2 , %b2
+ %r3 = mul i32 %a3 , %b3
+ %r4 = mul i32 %a4 , %b4
+ %r5 = mul i32 %a5 , %b5
+ %r6 = mul i32 %a6 , %b6
+ %r7 = mul i32 %a7 , %b7
+ %r8 = mul i32 %a8 , %b8
+ %r9 = mul i32 %a9 , %b9
+ %r10 = mul i32 %a10, %b10
+ %r11 = mul i32 %a11, %b11
+ %r12 = mul i32 %a12, %b12
+ %r13 = mul i32 %a13, %b13
+ %r14 = mul i32 %a14, %b14
+ %r15 = mul i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @mul_v32i16() {
+; SSE-LABEL: @mul_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @mul_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
+; SLM-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
+; SLM-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
+; SLM-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @mul_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @mul_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = mul i16 %a0 , %b0
+ %r1 = mul i16 %a1 , %b1
+ %r2 = mul i16 %a2 , %b2
+ %r3 = mul i16 %a3 , %b3
+ %r4 = mul i16 %a4 , %b4
+ %r5 = mul i16 %a5 , %b5
+ %r6 = mul i16 %a6 , %b6
+ %r7 = mul i16 %a7 , %b7
+ %r8 = mul i16 %a8 , %b8
+ %r9 = mul i16 %a9 , %b9
+ %r10 = mul i16 %a10, %b10
+ %r11 = mul i16 %a11, %b11
+ %r12 = mul i16 %a12, %b12
+ %r13 = mul i16 %a13, %b13
+ %r14 = mul i16 %a14, %b14
+ %r15 = mul i16 %a15, %b15
+ %r16 = mul i16 %a16, %b16
+ %r17 = mul i16 %a17, %b17
+ %r18 = mul i16 %a18, %b18
+ %r19 = mul i16 %a19, %b19
+ %r20 = mul i16 %a20, %b20
+ %r21 = mul i16 %a21, %b21
+ %r22 = mul i16 %a22, %b22
+ %r23 = mul i16 %a23, %b23
+ %r24 = mul i16 %a24, %b24
+ %r25 = mul i16 %a25, %b25
+ %r26 = mul i16 %a26, %b26
+ %r27 = mul i16 %a27, %b27
+ %r28 = mul i16 %a28, %b28
+ %r29 = mul i16 %a29, %b29
+ %r30 = mul i16 %a30, %b30
+ %r31 = mul i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
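+; All run lines share the common CHECK prefix for the i8 case, so every
+; configuration is expected to produce the same four <16 x i8> operations.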
+define void @mul_v64i8() {
+; CHECK-LABEL: @mul_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = mul i8 %a0 , %b0
+ %r1 = mul i8 %a1 , %b1
+ %r2 = mul i8 %a2 , %b2
+ %r3 = mul i8 %a3 , %b3
+ %r4 = mul i8 %a4 , %b4
+ %r5 = mul i8 %a5 , %b5
+ %r6 = mul i8 %a6 , %b6
+ %r7 = mul i8 %a7 , %b7
+ %r8 = mul i8 %a8 , %b8
+ %r9 = mul i8 %a9 , %b9
+ %r10 = mul i8 %a10, %b10
+ %r11 = mul i8 %a11, %b11
+ %r12 = mul i8 %a12, %b12
+ %r13 = mul i8 %a13, %b13
+ %r14 = mul i8 %a14, %b14
+ %r15 = mul i8 %a15, %b15
+ %r16 = mul i8 %a16, %b16
+ %r17 = mul i8 %a17, %b17
+ %r18 = mul i8 %a18, %b18
+ %r19 = mul i8 %a19, %b19
+ %r20 = mul i8 %a20, %b20
+ %r21 = mul i8 %a21, %b21
+ %r22 = mul i8 %a22, %b22
+ %r23 = mul i8 %a23, %b23
+ %r24 = mul i8 %a24, %b24
+ %r25 = mul i8 %a25, %b25
+ %r26 = mul i8 %a26, %b26
+ %r27 = mul i8 %a27, %b27
+ %r28 = mul i8 %a28, %b28
+ %r29 = mul i8 %a29, %b29
+ %r30 = mul i8 %a30, %b30
+ %r31 = mul i8 %a31, %b31
+ %r32 = mul i8 %a32, %b32
+ %r33 = mul i8 %a33, %b33
+ %r34 = mul i8 %a34, %b34
+ %r35 = mul i8 %a35, %b35
+ %r36 = mul i8 %a36, %b36
+ %r37 = mul i8 %a37, %b37
+ %r38 = mul i8 %a38, %b38
+ %r39 = mul i8 %a39, %b39
+ %r40 = mul i8 %a40, %b40
+ %r41 = mul i8 %a41, %b41
+ %r42 = mul i8 %a42, %b42
+ %r43 = mul i8 %a43, %b43
+ %r44 = mul i8 %a44, %b44
+ %r45 = mul i8 %a45, %b45
+ %r46 = mul i8 %a46, %b46
+ %r47 = mul i8 %a47, %b47
+ %r48 = mul i8 %a48, %b48
+ %r49 = mul i8 %a49, %b49
+ %r50 = mul i8 %a50, %b50
+ %r51 = mul i8 %a51, %b51
+ %r52 = mul i8 %a52, %b52
+ %r53 = mul i8 %a53, %b53
+ %r54 = mul i8 %a54, %b54
+ %r55 = mul i8 %a55, %b55
+ %r56 = mul i8 %a56, %b56
+ %r57 = mul i8 %a57, %b57
+ %r58 = mul i8 %a58, %b58
+ %r59 = mul i8 %a59, %b59
+ %r60 = mul i8 %a60, %b60
+ %r61 = mul i8 %a61, %b61
+ %r62 = mul i8 %a62, %b62
+ %r63 = mul i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
new file mode 100644
index 00000000000..31c8c4243f1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
@@ -0,0 +1,775 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare i64 @llvm.ssub.sat.i64(i64, i64)
+declare i32 @llvm.ssub.sat.i32(i32, i32)
+declare i16 @llvm.ssub.sat.i16(i16, i16)
+declare i8 @llvm.ssub.sat.i8 (i8 , i8 )
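+
+; Each function below loads the arrays above element by element, applies the
+; saturating-subtract intrinsic one lane at a time, and stores the results to
+; the matching @c* array; the prefixed CHECK lines verify that the SLP
+; vectorizer re-forms these scalar calls into vector llvm.ssub.sat calls
+; sized to each target's register width.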
+
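+; i64 lanes: SSE and SLM stay scalar, AVX1 forms <2 x i64>, AVX2 and
+; AVX256BW form <4 x i64>, and AVX512 forms a single <8 x i64>.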
+define void @sub_v8i64() {
+; SSE-LABEL: @sub_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v8i64(
+; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SLM-NEXT: [[R0:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A0]], i64 [[B0]])
+; SLM-NEXT: [[R1:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A1]], i64 [[B1]])
+; SLM-NEXT: [[R2:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A2]], i64 [[B2]])
+; SLM-NEXT: [[R3:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A3]], i64 [[B3]])
+; SLM-NEXT: [[R4:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A4]], i64 [[B4]])
+; SLM-NEXT: [[R5:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A5]], i64 [[B5]])
+; SLM-NEXT: [[R6:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A6]], i64 [[B6]])
+; SLM-NEXT: [[R7:%.*]] = call i64 @llvm.ssub.sat.i64(i64 [[A7]], i64 [[B7]])
+; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SLM-NEXT: ret void
+;
+; AVX1-LABEL: @sub_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
+; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
+; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
+; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
+; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @sub_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; AVX256BW-LABEL: @sub_v8i64(
+; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256BW-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = call i64 @llvm.ssub.sat.i64(i64 %a0, i64 %b0)
+ %r1 = call i64 @llvm.ssub.sat.i64(i64 %a1, i64 %b1)
+ %r2 = call i64 @llvm.ssub.sat.i64(i64 %a2, i64 %b2)
+ %r3 = call i64 @llvm.ssub.sat.i64(i64 %a3, i64 %b3)
+ %r4 = call i64 @llvm.ssub.sat.i64(i64 %a4, i64 %b4)
+ %r5 = call i64 @llvm.ssub.sat.i64(i64 %a5, i64 %b5)
+ %r6 = call i64 @llvm.ssub.sat.i64(i64 %a6, i64 %b6)
+ %r7 = call i64 @llvm.ssub.sat.i64(i64 %a7, i64 %b7)
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
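+; i32 lanes: SSE and SLM form <4 x i32>, AVX forms <8 x i32>, and AVX512
+; forms a single <16 x i32>.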
+define void @sub_v16i32() {
+; SSE-LABEL: @sub_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v16i32(
+; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @sub_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = call i32 @llvm.ssub.sat.i32(i32 %a0 , i32 %b0 )
+ %r1 = call i32 @llvm.ssub.sat.i32(i32 %a1 , i32 %b1 )
+ %r2 = call i32 @llvm.ssub.sat.i32(i32 %a2 , i32 %b2 )
+ %r3 = call i32 @llvm.ssub.sat.i32(i32 %a3 , i32 %b3 )
+ %r4 = call i32 @llvm.ssub.sat.i32(i32 %a4 , i32 %b4 )
+ %r5 = call i32 @llvm.ssub.sat.i32(i32 %a5 , i32 %b5 )
+ %r6 = call i32 @llvm.ssub.sat.i32(i32 %a6 , i32 %b6 )
+ %r7 = call i32 @llvm.ssub.sat.i32(i32 %a7 , i32 %b7 )
+ %r8 = call i32 @llvm.ssub.sat.i32(i32 %a8 , i32 %b8 )
+ %r9 = call i32 @llvm.ssub.sat.i32(i32 %a9 , i32 %b9 )
+ %r10 = call i32 @llvm.ssub.sat.i32(i32 %a10, i32 %b10)
+ %r11 = call i32 @llvm.ssub.sat.i32(i32 %a11, i32 %b11)
+ %r12 = call i32 @llvm.ssub.sat.i32(i32 %a12, i32 %b12)
+ %r13 = call i32 @llvm.ssub.sat.i32(i32 %a13, i32 %b13)
+ %r14 = call i32 @llvm.ssub.sat.i32(i32 %a14, i32 %b14)
+ %r15 = call i32 @llvm.ssub.sat.i32(i32 %a15, i32 %b15)
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
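+; i16 lanes: SSE and SLM form <8 x i16>, while AVX and AVX512 both form
+; <16 x i16> halves.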
+define void @sub_v32i16() {
+; SSE-LABEL: @sub_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @sub_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = call i16 @llvm.ssub.sat.i16(i16 %a0 , i16 %b0 )
+ %r1 = call i16 @llvm.ssub.sat.i16(i16 %a1 , i16 %b1 )
+ %r2 = call i16 @llvm.ssub.sat.i16(i16 %a2 , i16 %b2 )
+ %r3 = call i16 @llvm.ssub.sat.i16(i16 %a3 , i16 %b3 )
+ %r4 = call i16 @llvm.ssub.sat.i16(i16 %a4 , i16 %b4 )
+ %r5 = call i16 @llvm.ssub.sat.i16(i16 %a5 , i16 %b5 )
+ %r6 = call i16 @llvm.ssub.sat.i16(i16 %a6 , i16 %b6 )
+ %r7 = call i16 @llvm.ssub.sat.i16(i16 %a7 , i16 %b7 )
+ %r8 = call i16 @llvm.ssub.sat.i16(i16 %a8 , i16 %b8 )
+ %r9 = call i16 @llvm.ssub.sat.i16(i16 %a9 , i16 %b9 )
+ %r10 = call i16 @llvm.ssub.sat.i16(i16 %a10, i16 %b10)
+ %r11 = call i16 @llvm.ssub.sat.i16(i16 %a11, i16 %b11)
+ %r12 = call i16 @llvm.ssub.sat.i16(i16 %a12, i16 %b12)
+ %r13 = call i16 @llvm.ssub.sat.i16(i16 %a13, i16 %b13)
+ %r14 = call i16 @llvm.ssub.sat.i16(i16 %a14, i16 %b14)
+ %r15 = call i16 @llvm.ssub.sat.i16(i16 %a15, i16 %b15)
+ %r16 = call i16 @llvm.ssub.sat.i16(i16 %a16, i16 %b16)
+ %r17 = call i16 @llvm.ssub.sat.i16(i16 %a17, i16 %b17)
+ %r18 = call i16 @llvm.ssub.sat.i16(i16 %a18, i16 %b18)
+ %r19 = call i16 @llvm.ssub.sat.i16(i16 %a19, i16 %b19)
+ %r20 = call i16 @llvm.ssub.sat.i16(i16 %a20, i16 %b20)
+ %r21 = call i16 @llvm.ssub.sat.i16(i16 %a21, i16 %b21)
+ %r22 = call i16 @llvm.ssub.sat.i16(i16 %a22, i16 %b22)
+ %r23 = call i16 @llvm.ssub.sat.i16(i16 %a23, i16 %b23)
+ %r24 = call i16 @llvm.ssub.sat.i16(i16 %a24, i16 %b24)
+ %r25 = call i16 @llvm.ssub.sat.i16(i16 %a25, i16 %b25)
+ %r26 = call i16 @llvm.ssub.sat.i16(i16 %a26, i16 %b26)
+ %r27 = call i16 @llvm.ssub.sat.i16(i16 %a27, i16 %b27)
+ %r28 = call i16 @llvm.ssub.sat.i16(i16 %a28, i16 %b28)
+ %r29 = call i16 @llvm.ssub.sat.i16(i16 %a29, i16 %b29)
+ %r30 = call i16 @llvm.ssub.sat.i16(i16 %a30, i16 %b30)
+ %r31 = call i16 @llvm.ssub.sat.i16(i16 %a31, i16 %b31)
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
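+; 64 x i8 version of the same saturating-subtract pattern: 64 scalar loads from
+; @a8 and @b8, 64 llvm.ssub.sat.i8 calls, and 64 scalar stores to @c8. Per the
+; autogenerated CHECK lines, SLP rewrites this into four <16 x i8>
+; llvm.ssub.sat.v16i8 operations over 16-byte slices of the arrays.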
+define void @sub_v64i8() {
+; CHECK-LABEL: @sub_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = call i8 @llvm.ssub.sat.i8(i8 %a0 , i8 %b0 )
+ %r1 = call i8 @llvm.ssub.sat.i8(i8 %a1 , i8 %b1 )
+ %r2 = call i8 @llvm.ssub.sat.i8(i8 %a2 , i8 %b2 )
+ %r3 = call i8 @llvm.ssub.sat.i8(i8 %a3 , i8 %b3 )
+ %r4 = call i8 @llvm.ssub.sat.i8(i8 %a4 , i8 %b4 )
+ %r5 = call i8 @llvm.ssub.sat.i8(i8 %a5 , i8 %b5 )
+ %r6 = call i8 @llvm.ssub.sat.i8(i8 %a6 , i8 %b6 )
+ %r7 = call i8 @llvm.ssub.sat.i8(i8 %a7 , i8 %b7 )
+ %r8 = call i8 @llvm.ssub.sat.i8(i8 %a8 , i8 %b8 )
+ %r9 = call i8 @llvm.ssub.sat.i8(i8 %a9 , i8 %b9 )
+ %r10 = call i8 @llvm.ssub.sat.i8(i8 %a10, i8 %b10)
+ %r11 = call i8 @llvm.ssub.sat.i8(i8 %a11, i8 %b11)
+ %r12 = call i8 @llvm.ssub.sat.i8(i8 %a12, i8 %b12)
+ %r13 = call i8 @llvm.ssub.sat.i8(i8 %a13, i8 %b13)
+ %r14 = call i8 @llvm.ssub.sat.i8(i8 %a14, i8 %b14)
+ %r15 = call i8 @llvm.ssub.sat.i8(i8 %a15, i8 %b15)
+ %r16 = call i8 @llvm.ssub.sat.i8(i8 %a16, i8 %b16)
+ %r17 = call i8 @llvm.ssub.sat.i8(i8 %a17, i8 %b17)
+ %r18 = call i8 @llvm.ssub.sat.i8(i8 %a18, i8 %b18)
+ %r19 = call i8 @llvm.ssub.sat.i8(i8 %a19, i8 %b19)
+ %r20 = call i8 @llvm.ssub.sat.i8(i8 %a20, i8 %b20)
+ %r21 = call i8 @llvm.ssub.sat.i8(i8 %a21, i8 %b21)
+ %r22 = call i8 @llvm.ssub.sat.i8(i8 %a22, i8 %b22)
+ %r23 = call i8 @llvm.ssub.sat.i8(i8 %a23, i8 %b23)
+ %r24 = call i8 @llvm.ssub.sat.i8(i8 %a24, i8 %b24)
+ %r25 = call i8 @llvm.ssub.sat.i8(i8 %a25, i8 %b25)
+ %r26 = call i8 @llvm.ssub.sat.i8(i8 %a26, i8 %b26)
+ %r27 = call i8 @llvm.ssub.sat.i8(i8 %a27, i8 %b27)
+ %r28 = call i8 @llvm.ssub.sat.i8(i8 %a28, i8 %b28)
+ %r29 = call i8 @llvm.ssub.sat.i8(i8 %a29, i8 %b29)
+ %r30 = call i8 @llvm.ssub.sat.i8(i8 %a30, i8 %b30)
+ %r31 = call i8 @llvm.ssub.sat.i8(i8 %a31, i8 %b31)
+ %r32 = call i8 @llvm.ssub.sat.i8(i8 %a32, i8 %b32)
+ %r33 = call i8 @llvm.ssub.sat.i8(i8 %a33, i8 %b33)
+ %r34 = call i8 @llvm.ssub.sat.i8(i8 %a34, i8 %b34)
+ %r35 = call i8 @llvm.ssub.sat.i8(i8 %a35, i8 %b35)
+ %r36 = call i8 @llvm.ssub.sat.i8(i8 %a36, i8 %b36)
+ %r37 = call i8 @llvm.ssub.sat.i8(i8 %a37, i8 %b37)
+ %r38 = call i8 @llvm.ssub.sat.i8(i8 %a38, i8 %b38)
+ %r39 = call i8 @llvm.ssub.sat.i8(i8 %a39, i8 %b39)
+ %r40 = call i8 @llvm.ssub.sat.i8(i8 %a40, i8 %b40)
+ %r41 = call i8 @llvm.ssub.sat.i8(i8 %a41, i8 %b41)
+ %r42 = call i8 @llvm.ssub.sat.i8(i8 %a42, i8 %b42)
+ %r43 = call i8 @llvm.ssub.sat.i8(i8 %a43, i8 %b43)
+ %r44 = call i8 @llvm.ssub.sat.i8(i8 %a44, i8 %b44)
+ %r45 = call i8 @llvm.ssub.sat.i8(i8 %a45, i8 %b45)
+ %r46 = call i8 @llvm.ssub.sat.i8(i8 %a46, i8 %b46)
+ %r47 = call i8 @llvm.ssub.sat.i8(i8 %a47, i8 %b47)
+ %r48 = call i8 @llvm.ssub.sat.i8(i8 %a48, i8 %b48)
+ %r49 = call i8 @llvm.ssub.sat.i8(i8 %a49, i8 %b49)
+ %r50 = call i8 @llvm.ssub.sat.i8(i8 %a50, i8 %b50)
+ %r51 = call i8 @llvm.ssub.sat.i8(i8 %a51, i8 %b51)
+ %r52 = call i8 @llvm.ssub.sat.i8(i8 %a52, i8 %b52)
+ %r53 = call i8 @llvm.ssub.sat.i8(i8 %a53, i8 %b53)
+ %r54 = call i8 @llvm.ssub.sat.i8(i8 %a54, i8 %b54)
+ %r55 = call i8 @llvm.ssub.sat.i8(i8 %a55, i8 %b55)
+ %r56 = call i8 @llvm.ssub.sat.i8(i8 %a56, i8 %b56)
+ %r57 = call i8 @llvm.ssub.sat.i8(i8 %a57, i8 %b57)
+ %r58 = call i8 @llvm.ssub.sat.i8(i8 %a58, i8 %b58)
+ %r59 = call i8 @llvm.ssub.sat.i8(i8 %a59, i8 %b59)
+ %r60 = call i8 @llvm.ssub.sat.i8(i8 %a60, i8 %b60)
+ %r61 = call i8 @llvm.ssub.sat.i8(i8 %a61, i8 %b61)
+ %r62 = call i8 @llvm.ssub.sat.i8(i8 %a62, i8 %b62)
+ %r63 = call i8 @llvm.ssub.sat.i8(i8 %a63, i8 %b63)
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll
new file mode 100644
index 00000000000..f21ba1f9634
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssubo.ll
@@ -0,0 +1,1254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
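+; The RUN lines sweep the SSE, SLM, AVX1/AVX2, and AVX512 cost models;
+; behaviour common to every configuration is matched via the shared CHECK
+; prefix.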
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32)
+declare {i16, i1} @llvm.ssub.with.overflow.i16(i16, i16)
+declare {i8 , i1} @llvm.ssub.with.overflow.i8 (i8 , i8 )
+
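+; Unlike the saturating intrinsics, the overflow intrinsics return a { iN, i1 }
+; aggregate, so each result has to be unpacked with extractvalue before it can
+; be stored. As the autogenerated CHECK lines below show, SLP currently leaves
+; all of these calls scalar in every run configuration.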
+define void @sub_v8i64() {
+; CHECK-LABEL: @sub_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %c0 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a0, i64 %b0)
+ %c1 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a1, i64 %b1)
+ %c2 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a2, i64 %b2)
+ %c3 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a3, i64 %b3)
+ %c4 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a4, i64 %b4)
+ %c5 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a5, i64 %b5)
+ %c6 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a6, i64 %b6)
+ %c7 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %a7, i64 %b7)
+ %r0 = extractvalue {i64, i1} %c0, 0
+ %r1 = extractvalue {i64, i1} %c1, 0
+ %r2 = extractvalue {i64, i1} %c2, 0
+ %r3 = extractvalue {i64, i1} %c3, 0
+ %r4 = extractvalue {i64, i1} %c4, 0
+ %r5 = extractvalue {i64, i1} %c5, 0
+ %r6 = extractvalue {i64, i1} %c6, 0
+ %r7 = extractvalue {i64, i1} %c7, 0
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
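+; The remaining functions repeat the same load/ssub.with.overflow/extractvalue/
+; store pattern at narrower element widths; the checks stay fully scalar there
+; as well.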
+define void @sub_v16i32() {
+; CHECK-LABEL: @sub_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %c0 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a0 , i32 %b0 )
+ %c1 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a1 , i32 %b1 )
+ %c2 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a2 , i32 %b2 )
+ %c3 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a3 , i32 %b3 )
+ %c4 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a4 , i32 %b4 )
+ %c5 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a5 , i32 %b5 )
+ %c6 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a6 , i32 %b6 )
+ %c7 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a7 , i32 %b7 )
+ %c8 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a8 , i32 %b8 )
+ %c9 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a9 , i32 %b9 )
+ %c10 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a10, i32 %b10)
+ %c11 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a11, i32 %b11)
+ %c12 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a12, i32 %b12)
+ %c13 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a13, i32 %b13)
+ %c14 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a14, i32 %b14)
+ %c15 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %a15, i32 %b15)
+ %r0 = extractvalue {i32, i1} %c0 , 0
+ %r1 = extractvalue {i32, i1} %c1 , 0
+ %r2 = extractvalue {i32, i1} %c2 , 0
+ %r3 = extractvalue {i32, i1} %c3 , 0
+ %r4 = extractvalue {i32, i1} %c4 , 0
+ %r5 = extractvalue {i32, i1} %c5 , 0
+ %r6 = extractvalue {i32, i1} %c6 , 0
+ %r7 = extractvalue {i32, i1} %c7 , 0
+ %r8 = extractvalue {i32, i1} %c8 , 0
+ %r9 = extractvalue {i32, i1} %c9 , 0
+ %r10 = extractvalue {i32, i1} %c10, 0
+ %r11 = extractvalue {i32, i1} %c11, 0
+ %r12 = extractvalue {i32, i1} %c12, 0
+ %r13 = extractvalue {i32, i1} %c13, 0
+ %r14 = extractvalue {i32, i1} %c14, 0
+ %r15 = extractvalue {i32, i1} %c15, 0
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @sub_v32i16() {
+; CHECK-LABEL: @sub_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; CHECK-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %c0 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a0 , i16 %b0 )
+ %c1 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a1 , i16 %b1 )
+ %c2 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a2 , i16 %b2 )
+ %c3 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a3 , i16 %b3 )
+ %c4 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a4 , i16 %b4 )
+ %c5 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a5 , i16 %b5 )
+ %c6 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a6 , i16 %b6 )
+ %c7 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a7 , i16 %b7 )
+ %c8 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a8 , i16 %b8 )
+ %c9 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a9 , i16 %b9 )
+ %c10 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a10, i16 %b10)
+ %c11 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a11, i16 %b11)
+ %c12 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a12, i16 %b12)
+ %c13 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a13, i16 %b13)
+ %c14 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a14, i16 %b14)
+ %c15 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a15, i16 %b15)
+ %c16 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a16, i16 %b16)
+ %c17 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a17, i16 %b17)
+ %c18 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a18, i16 %b18)
+ %c19 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a19, i16 %b19)
+ %c20 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a20, i16 %b20)
+ %c21 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a21, i16 %b21)
+ %c22 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a22, i16 %b22)
+ %c23 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a23, i16 %b23)
+ %c24 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a24, i16 %b24)
+ %c25 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a25, i16 %b25)
+ %c26 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a26, i16 %b26)
+ %c27 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a27, i16 %b27)
+ %c28 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a28, i16 %b28)
+ %c29 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a29, i16 %b29)
+ %c30 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a30, i16 %b30)
+ %c31 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 %a31, i16 %b31)
+ %r0 = extractvalue {i16, i1} %c0 , 0
+ %r1 = extractvalue {i16, i1} %c1 , 0
+ %r2 = extractvalue {i16, i1} %c2 , 0
+ %r3 = extractvalue {i16, i1} %c3 , 0
+ %r4 = extractvalue {i16, i1} %c4 , 0
+ %r5 = extractvalue {i16, i1} %c5 , 0
+ %r6 = extractvalue {i16, i1} %c6 , 0
+ %r7 = extractvalue {i16, i1} %c7 , 0
+ %r8 = extractvalue {i16, i1} %c8 , 0
+ %r9 = extractvalue {i16, i1} %c9 , 0
+ %r10 = extractvalue {i16, i1} %c10, 0
+ %r11 = extractvalue {i16, i1} %c11, 0
+ %r12 = extractvalue {i16, i1} %c12, 0
+ %r13 = extractvalue {i16, i1} %c13, 0
+ %r14 = extractvalue {i16, i1} %c14, 0
+ %r15 = extractvalue {i16, i1} %c15, 0
+ %r16 = extractvalue {i16, i1} %c16, 0
+ %r17 = extractvalue {i16, i1} %c17, 0
+ %r18 = extractvalue {i16, i1} %c18, 0
+ %r19 = extractvalue {i16, i1} %c19, 0
+ %r20 = extractvalue {i16, i1} %c20, 0
+ %r21 = extractvalue {i16, i1} %c21, 0
+ %r22 = extractvalue {i16, i1} %c22, 0
+ %r23 = extractvalue {i16, i1} %c23, 0
+ %r24 = extractvalue {i16, i1} %c24, 0
+ %r25 = extractvalue {i16, i1} %c25, 0
+ %r26 = extractvalue {i16, i1} %c26, 0
+ %r27 = extractvalue {i16, i1} %c27, 0
+ %r28 = extractvalue {i16, i1} %c28, 0
+ %r29 = extractvalue {i16, i1} %c29, 0
+ %r30 = extractvalue {i16, i1} %c30, 0
+ %r31 = extractvalue {i16, i1} %c31, 0
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
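+; Same pattern for the 64 x i8 case: the expected output keeps every
+; @llvm.ssub.with.overflow.i8 call scalar.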
+define void @sub_v64i8() {
+; CHECK-LABEL: @sub_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[R1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %c0 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a0 , i8 %b0 )
+ %c1 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a1 , i8 %b1 )
+ %c2 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a2 , i8 %b2 )
+ %c3 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a3 , i8 %b3 )
+ %c4 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a4 , i8 %b4 )
+ %c5 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a5 , i8 %b5 )
+ %c6 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a6 , i8 %b6 )
+ %c7 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a7 , i8 %b7 )
+ %c8 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a8 , i8 %b8 )
+ %c9 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a9 , i8 %b9 )
+ %c10 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a10, i8 %b10)
+ %c11 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a11, i8 %b11)
+ %c12 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a12, i8 %b12)
+ %c13 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a13, i8 %b13)
+ %c14 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a14, i8 %b14)
+ %c15 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a15, i8 %b15)
+ %c16 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a16, i8 %b16)
+ %c17 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a17, i8 %b17)
+ %c18 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a18, i8 %b18)
+ %c19 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a19, i8 %b19)
+ %c20 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a20, i8 %b20)
+ %c21 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a21, i8 %b21)
+ %c22 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a22, i8 %b22)
+ %c23 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a23, i8 %b23)
+ %c24 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a24, i8 %b24)
+ %c25 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a25, i8 %b25)
+ %c26 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a26, i8 %b26)
+ %c27 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a27, i8 %b27)
+ %c28 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a28, i8 %b28)
+ %c29 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a29, i8 %b29)
+ %c30 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a30, i8 %b30)
+ %c31 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a31, i8 %b31)
+ %c32 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a32, i8 %b32)
+ %c33 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a33, i8 %b33)
+ %c34 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a34, i8 %b34)
+ %c35 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a35, i8 %b35)
+ %c36 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a36, i8 %b36)
+ %c37 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a37, i8 %b37)
+ %c38 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a38, i8 %b38)
+ %c39 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a39, i8 %b39)
+ %c40 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a40, i8 %b40)
+ %c41 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a41, i8 %b41)
+ %c42 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a42, i8 %b42)
+ %c43 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a43, i8 %b43)
+ %c44 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a44, i8 %b44)
+ %c45 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a45, i8 %b45)
+ %c46 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a46, i8 %b46)
+ %c47 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a47, i8 %b47)
+ %c48 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a48, i8 %b48)
+ %c49 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a49, i8 %b49)
+ %c50 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a50, i8 %b50)
+ %c51 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a51, i8 %b51)
+ %c52 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a52, i8 %b52)
+ %c53 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a53, i8 %b53)
+ %c54 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a54, i8 %b54)
+ %c55 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a55, i8 %b55)
+ %c56 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a56, i8 %b56)
+ %c57 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a57, i8 %b57)
+ %c58 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a58, i8 %b58)
+ %c59 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a59, i8 %b59)
+ %c60 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a60, i8 %b60)
+ %c61 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a61, i8 %b61)
+ %c62 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a62, i8 %b62)
+ %c63 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a63, i8 %b63)
+ %r0 = extractvalue {i8, i1} %c0 , 0
+ %r1 = extractvalue {i8, i1} %c1 , 0
+ %r2 = extractvalue {i8, i1} %c2 , 0
+ %r3 = extractvalue {i8, i1} %c3 , 0
+ %r4 = extractvalue {i8, i1} %c4 , 0
+ %r5 = extractvalue {i8, i1} %c5 , 0
+ %r6 = extractvalue {i8, i1} %c6 , 0
+ %r7 = extractvalue {i8, i1} %c7 , 0
+ %r8 = extractvalue {i8, i1} %c8 , 0
+ %r9 = extractvalue {i8, i1} %c9 , 0
+ %r10 = extractvalue {i8, i1} %c10, 0
+ %r11 = extractvalue {i8, i1} %c11, 0
+ %r12 = extractvalue {i8, i1} %c12, 0
+ %r13 = extractvalue {i8, i1} %c13, 0
+ %r14 = extractvalue {i8, i1} %c14, 0
+ %r15 = extractvalue {i8, i1} %c15, 0
+ %r16 = extractvalue {i8, i1} %c16, 0
+ %r17 = extractvalue {i8, i1} %c17, 0
+ %r18 = extractvalue {i8, i1} %c18, 0
+ %r19 = extractvalue {i8, i1} %c19, 0
+ %r20 = extractvalue {i8, i1} %c20, 0
+ %r21 = extractvalue {i8, i1} %c21, 0
+ %r22 = extractvalue {i8, i1} %c22, 0
+ %r23 = extractvalue {i8, i1} %c23, 0
+ %r24 = extractvalue {i8, i1} %c24, 0
+ %r25 = extractvalue {i8, i1} %c25, 0
+ %r26 = extractvalue {i8, i1} %c26, 0
+ %r27 = extractvalue {i8, i1} %c27, 0
+ %r28 = extractvalue {i8, i1} %c28, 0
+ %r29 = extractvalue {i8, i1} %c29, 0
+ %r30 = extractvalue {i8, i1} %c30, 0
+ %r31 = extractvalue {i8, i1} %c31, 0
+ %r32 = extractvalue {i8, i1} %c32, 0
+ %r33 = extractvalue {i8, i1} %c33, 0
+ %r34 = extractvalue {i8, i1} %c34, 0
+ %r35 = extractvalue {i8, i1} %c35, 0
+ %r36 = extractvalue {i8, i1} %c36, 0
+ %r37 = extractvalue {i8, i1} %c37, 0
+ %r38 = extractvalue {i8, i1} %c38, 0
+ %r39 = extractvalue {i8, i1} %c39, 0
+ %r40 = extractvalue {i8, i1} %c40, 0
+ %r41 = extractvalue {i8, i1} %c41, 0
+ %r42 = extractvalue {i8, i1} %c42, 0
+ %r43 = extractvalue {i8, i1} %c43, 0
+ %r44 = extractvalue {i8, i1} %c44, 0
+ %r45 = extractvalue {i8, i1} %c45, 0
+ %r46 = extractvalue {i8, i1} %c46, 0
+ %r47 = extractvalue {i8, i1} %c47, 0
+ %r48 = extractvalue {i8, i1} %c48, 0
+ %r49 = extractvalue {i8, i1} %c49, 0
+ %r50 = extractvalue {i8, i1} %c50, 0
+ %r51 = extractvalue {i8, i1} %c51, 0
+ %r52 = extractvalue {i8, i1} %c52, 0
+ %r53 = extractvalue {i8, i1} %c53, 0
+ %r54 = extractvalue {i8, i1} %c54, 0
+ %r55 = extractvalue {i8, i1} %c55, 0
+ %r56 = extractvalue {i8, i1} %c56, 0
+ %r57 = extractvalue {i8, i1} %c57, 0
+ %r58 = extractvalue {i8, i1} %c58, 0
+ %r59 = extractvalue {i8, i1} %c59, 0
+ %r60 = extractvalue {i8, i1} %c60, 0
+ %r61 = extractvalue {i8, i1} %c61, 0
+ %r62 = extractvalue {i8, i1} %c62, 0
+ %r63 = extractvalue {i8, i1} %c63, 0
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
new file mode 100644
index 00000000000..ec554ff4fb5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
@@ -0,0 +1,729 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare i64 @llvm.usub.sat.i64(i64, i64)
+declare i32 @llvm.usub.sat.i32(i32, i32)
+declare i16 @llvm.usub.sat.i16(i16, i16)
+declare i8 @llvm.usub.sat.i8 (i8 , i8 )
+
+define void @sub_v8i64() {
+; SSE-LABEL: @sub_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
+; SSE-NEXT: [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
+; SSE-NEXT: [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
+; SSE-NEXT: [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
+; SSE-NEXT: [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
+; SSE-NEXT: [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
+; SSE-NEXT: [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
+; SSE-NEXT: [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v8i64(
+; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
+; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @sub_v8i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]])
+; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP2]])
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = call i64 @llvm.usub.sat.i64(i64 %a0, i64 %b0)
+ %r1 = call i64 @llvm.usub.sat.i64(i64 %a1, i64 %b1)
+ %r2 = call i64 @llvm.usub.sat.i64(i64 %a2, i64 %b2)
+ %r3 = call i64 @llvm.usub.sat.i64(i64 %a3, i64 %b3)
+ %r4 = call i64 @llvm.usub.sat.i64(i64 %a4, i64 %b4)
+ %r5 = call i64 @llvm.usub.sat.i64(i64 %a5, i64 %b5)
+ %r6 = call i64 @llvm.usub.sat.i64(i64 %a6, i64 %b6)
+ %r7 = call i64 @llvm.usub.sat.i64(i64 %a7, i64 %b7)
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @sub_v16i32() {
+; SSE-LABEL: @sub_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v16i32(
+; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]])
+; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @sub_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]])
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = call i32 @llvm.usub.sat.i32(i32 %a0 , i32 %b0 )
+ %r1 = call i32 @llvm.usub.sat.i32(i32 %a1 , i32 %b1 )
+ %r2 = call i32 @llvm.usub.sat.i32(i32 %a2 , i32 %b2 )
+ %r3 = call i32 @llvm.usub.sat.i32(i32 %a3 , i32 %b3 )
+ %r4 = call i32 @llvm.usub.sat.i32(i32 %a4 , i32 %b4 )
+ %r5 = call i32 @llvm.usub.sat.i32(i32 %a5 , i32 %b5 )
+ %r6 = call i32 @llvm.usub.sat.i32(i32 %a6 , i32 %b6 )
+ %r7 = call i32 @llvm.usub.sat.i32(i32 %a7 , i32 %b7 )
+ %r8 = call i32 @llvm.usub.sat.i32(i32 %a8 , i32 %b8 )
+ %r9 = call i32 @llvm.usub.sat.i32(i32 %a9 , i32 %b9 )
+ %r10 = call i32 @llvm.usub.sat.i32(i32 %a10, i32 %b10)
+ %r11 = call i32 @llvm.usub.sat.i32(i32 %a11, i32 %b11)
+ %r12 = call i32 @llvm.usub.sat.i32(i32 %a12, i32 %b12)
+ %r13 = call i32 @llvm.usub.sat.i32(i32 %a13, i32 %b13)
+ %r14 = call i32 @llvm.usub.sat.i32(i32 %a14, i32 %b14)
+ %r15 = call i32 @llvm.usub.sat.i32(i32 %a15, i32 %b15)
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @sub_v32i16() {
+; SSE-LABEL: @sub_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]])
+; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]])
+; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]])
+; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]])
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @sub_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]])
+; AVX512-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]])
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
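+; Every configuration vectorizes the 32 scalar calls below: SSE/SLM into four
+; <8 x i16> usub.sat ops, AVX/AVX512 into two <16 x i16> ops. Per lane this
+; computes c16[i] = a16[i] < b16[i] ? 0 : a16[i] - b16[i].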
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = call i16 @llvm.usub.sat.i16(i16 %a0 , i16 %b0 )
+ %r1 = call i16 @llvm.usub.sat.i16(i16 %a1 , i16 %b1 )
+ %r2 = call i16 @llvm.usub.sat.i16(i16 %a2 , i16 %b2 )
+ %r3 = call i16 @llvm.usub.sat.i16(i16 %a3 , i16 %b3 )
+ %r4 = call i16 @llvm.usub.sat.i16(i16 %a4 , i16 %b4 )
+ %r5 = call i16 @llvm.usub.sat.i16(i16 %a5 , i16 %b5 )
+ %r6 = call i16 @llvm.usub.sat.i16(i16 %a6 , i16 %b6 )
+ %r7 = call i16 @llvm.usub.sat.i16(i16 %a7 , i16 %b7 )
+ %r8 = call i16 @llvm.usub.sat.i16(i16 %a8 , i16 %b8 )
+ %r9 = call i16 @llvm.usub.sat.i16(i16 %a9 , i16 %b9 )
+ %r10 = call i16 @llvm.usub.sat.i16(i16 %a10, i16 %b10)
+ %r11 = call i16 @llvm.usub.sat.i16(i16 %a11, i16 %b11)
+ %r12 = call i16 @llvm.usub.sat.i16(i16 %a12, i16 %b12)
+ %r13 = call i16 @llvm.usub.sat.i16(i16 %a13, i16 %b13)
+ %r14 = call i16 @llvm.usub.sat.i16(i16 %a14, i16 %b14)
+ %r15 = call i16 @llvm.usub.sat.i16(i16 %a15, i16 %b15)
+ %r16 = call i16 @llvm.usub.sat.i16(i16 %a16, i16 %b16)
+ %r17 = call i16 @llvm.usub.sat.i16(i16 %a17, i16 %b17)
+ %r18 = call i16 @llvm.usub.sat.i16(i16 %a18, i16 %b18)
+ %r19 = call i16 @llvm.usub.sat.i16(i16 %a19, i16 %b19)
+ %r20 = call i16 @llvm.usub.sat.i16(i16 %a20, i16 %b20)
+ %r21 = call i16 @llvm.usub.sat.i16(i16 %a21, i16 %b21)
+ %r22 = call i16 @llvm.usub.sat.i16(i16 %a22, i16 %b22)
+ %r23 = call i16 @llvm.usub.sat.i16(i16 %a23, i16 %b23)
+ %r24 = call i16 @llvm.usub.sat.i16(i16 %a24, i16 %b24)
+ %r25 = call i16 @llvm.usub.sat.i16(i16 %a25, i16 %b25)
+ %r26 = call i16 @llvm.usub.sat.i16(i16 %a26, i16 %b26)
+ %r27 = call i16 @llvm.usub.sat.i16(i16 %a27, i16 %b27)
+ %r28 = call i16 @llvm.usub.sat.i16(i16 %a28, i16 %b28)
+ %r29 = call i16 @llvm.usub.sat.i16(i16 %a29, i16 %b29)
+ %r30 = call i16 @llvm.usub.sat.i16(i16 %a30, i16 %b30)
+ %r31 = call i16 @llvm.usub.sat.i16(i16 %a31, i16 %b31)
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
+define void @sub_v64i8() {
+; CHECK-LABEL: @sub_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]])
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]])
+; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]])
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
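+; A single CHECK prefix covers every configuration here: all of them
+; vectorize the 64 scalar calls below into four <16 x i8> usub.sat ops.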
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = call i8 @llvm.usub.sat.i8(i8 %a0 , i8 %b0 )
+ %r1 = call i8 @llvm.usub.sat.i8(i8 %a1 , i8 %b1 )
+ %r2 = call i8 @llvm.usub.sat.i8(i8 %a2 , i8 %b2 )
+ %r3 = call i8 @llvm.usub.sat.i8(i8 %a3 , i8 %b3 )
+ %r4 = call i8 @llvm.usub.sat.i8(i8 %a4 , i8 %b4 )
+ %r5 = call i8 @llvm.usub.sat.i8(i8 %a5 , i8 %b5 )
+ %r6 = call i8 @llvm.usub.sat.i8(i8 %a6 , i8 %b6 )
+ %r7 = call i8 @llvm.usub.sat.i8(i8 %a7 , i8 %b7 )
+ %r8 = call i8 @llvm.usub.sat.i8(i8 %a8 , i8 %b8 )
+ %r9 = call i8 @llvm.usub.sat.i8(i8 %a9 , i8 %b9 )
+ %r10 = call i8 @llvm.usub.sat.i8(i8 %a10, i8 %b10)
+ %r11 = call i8 @llvm.usub.sat.i8(i8 %a11, i8 %b11)
+ %r12 = call i8 @llvm.usub.sat.i8(i8 %a12, i8 %b12)
+ %r13 = call i8 @llvm.usub.sat.i8(i8 %a13, i8 %b13)
+ %r14 = call i8 @llvm.usub.sat.i8(i8 %a14, i8 %b14)
+ %r15 = call i8 @llvm.usub.sat.i8(i8 %a15, i8 %b15)
+ %r16 = call i8 @llvm.usub.sat.i8(i8 %a16, i8 %b16)
+ %r17 = call i8 @llvm.usub.sat.i8(i8 %a17, i8 %b17)
+ %r18 = call i8 @llvm.usub.sat.i8(i8 %a18, i8 %b18)
+ %r19 = call i8 @llvm.usub.sat.i8(i8 %a19, i8 %b19)
+ %r20 = call i8 @llvm.usub.sat.i8(i8 %a20, i8 %b20)
+ %r21 = call i8 @llvm.usub.sat.i8(i8 %a21, i8 %b21)
+ %r22 = call i8 @llvm.usub.sat.i8(i8 %a22, i8 %b22)
+ %r23 = call i8 @llvm.usub.sat.i8(i8 %a23, i8 %b23)
+ %r24 = call i8 @llvm.usub.sat.i8(i8 %a24, i8 %b24)
+ %r25 = call i8 @llvm.usub.sat.i8(i8 %a25, i8 %b25)
+ %r26 = call i8 @llvm.usub.sat.i8(i8 %a26, i8 %b26)
+ %r27 = call i8 @llvm.usub.sat.i8(i8 %a27, i8 %b27)
+ %r28 = call i8 @llvm.usub.sat.i8(i8 %a28, i8 %b28)
+ %r29 = call i8 @llvm.usub.sat.i8(i8 %a29, i8 %b29)
+ %r30 = call i8 @llvm.usub.sat.i8(i8 %a30, i8 %b30)
+ %r31 = call i8 @llvm.usub.sat.i8(i8 %a31, i8 %b31)
+ %r32 = call i8 @llvm.usub.sat.i8(i8 %a32, i8 %b32)
+ %r33 = call i8 @llvm.usub.sat.i8(i8 %a33, i8 %b33)
+ %r34 = call i8 @llvm.usub.sat.i8(i8 %a34, i8 %b34)
+ %r35 = call i8 @llvm.usub.sat.i8(i8 %a35, i8 %b35)
+ %r36 = call i8 @llvm.usub.sat.i8(i8 %a36, i8 %b36)
+ %r37 = call i8 @llvm.usub.sat.i8(i8 %a37, i8 %b37)
+ %r38 = call i8 @llvm.usub.sat.i8(i8 %a38, i8 %b38)
+ %r39 = call i8 @llvm.usub.sat.i8(i8 %a39, i8 %b39)
+ %r40 = call i8 @llvm.usub.sat.i8(i8 %a40, i8 %b40)
+ %r41 = call i8 @llvm.usub.sat.i8(i8 %a41, i8 %b41)
+ %r42 = call i8 @llvm.usub.sat.i8(i8 %a42, i8 %b42)
+ %r43 = call i8 @llvm.usub.sat.i8(i8 %a43, i8 %b43)
+ %r44 = call i8 @llvm.usub.sat.i8(i8 %a44, i8 %b44)
+ %r45 = call i8 @llvm.usub.sat.i8(i8 %a45, i8 %b45)
+ %r46 = call i8 @llvm.usub.sat.i8(i8 %a46, i8 %b46)
+ %r47 = call i8 @llvm.usub.sat.i8(i8 %a47, i8 %b47)
+ %r48 = call i8 @llvm.usub.sat.i8(i8 %a48, i8 %b48)
+ %r49 = call i8 @llvm.usub.sat.i8(i8 %a49, i8 %b49)
+ %r50 = call i8 @llvm.usub.sat.i8(i8 %a50, i8 %b50)
+ %r51 = call i8 @llvm.usub.sat.i8(i8 %a51, i8 %b51)
+ %r52 = call i8 @llvm.usub.sat.i8(i8 %a52, i8 %b52)
+ %r53 = call i8 @llvm.usub.sat.i8(i8 %a53, i8 %b53)
+ %r54 = call i8 @llvm.usub.sat.i8(i8 %a54, i8 %b54)
+ %r55 = call i8 @llvm.usub.sat.i8(i8 %a55, i8 %b55)
+ %r56 = call i8 @llvm.usub.sat.i8(i8 %a56, i8 %b56)
+ %r57 = call i8 @llvm.usub.sat.i8(i8 %a57, i8 %b57)
+ %r58 = call i8 @llvm.usub.sat.i8(i8 %a58, i8 %b58)
+ %r59 = call i8 @llvm.usub.sat.i8(i8 %a59, i8 %b59)
+ %r60 = call i8 @llvm.usub.sat.i8(i8 %a60, i8 %b60)
+ %r61 = call i8 @llvm.usub.sat.i8(i8 %a61, i8 %b61)
+ %r62 = call i8 @llvm.usub.sat.i8(i8 %a62, i8 %b62)
+ %r63 = call i8 @llvm.usub.sat.i8(i8 %a63, i8 %b63)
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll
new file mode 100644
index 00000000000..5edbcd93fcc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usubo.ll
@@ -0,0 +1,1254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256BW
+
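+; Unlike llvm.usub.sat, the llvm.usub.with.overflow.* intrinsics return an
+; aggregate { iN, i1 }; the tests below use only field 0 of the result.
+; Roughly equivalent C per lane (illustrative sketch, not taken from the
+; source):
+;   unsigned long long r;
+;   (void)__builtin_sub_overflow(a64[i], b64[i], &r); /* overflow bit unused */
+;   c64[i] = r;
+; The checks show the SLP vectorizer currently leaves these calls scalar on
+; every configuration tested.
+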
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)
+declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16)
+declare {i8 , i1} @llvm.usub.with.overflow.i8 (i8 , i8 )
+
+define void @sub_v8i64() {
+; CHECK-LABEL: @sub_v8i64(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[C0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A0]], i64 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A1]], i64 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A2]], i64 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A3]], i64 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A4]], i64 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A5]], i64 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A6]], i64 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[A7]], i64 [[B7]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i64, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i64, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i64, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i64, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i64, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i64, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i64, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i64, i1 } [[C7]], 0
+; CHECK-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; CHECK-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; CHECK-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; CHECK-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; CHECK-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; CHECK-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; CHECK-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; CHECK-NEXT: ret void
+;
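+; No vectorization happens: the eight overflow calls and their extractvalue
+; users are kept scalar, as the checks above show.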
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %c0 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a0, i64 %b0)
+ %c1 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a1, i64 %b1)
+ %c2 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a2, i64 %b2)
+ %c3 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a3, i64 %b3)
+ %c4 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a4, i64 %b4)
+ %c5 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a5, i64 %b5)
+ %c6 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a6, i64 %b6)
+ %c7 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a7, i64 %b7)
+ %r0 = extractvalue {i64, i1} %c0, 0
+ %r1 = extractvalue {i64, i1} %c1, 0
+ %r2 = extractvalue {i64, i1} %c2, 0
+ %r3 = extractvalue {i64, i1} %c3, 0
+ %r4 = extractvalue {i64, i1} %c4, 0
+ %r5 = extractvalue {i64, i1} %c5, 0
+ %r6 = extractvalue {i64, i1} %c6, 0
+ %r7 = extractvalue {i64, i1} %c7, 0
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @sub_v16i32() {
+; CHECK-LABEL: @sub_v16i32(
+; CHECK-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; CHECK-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; CHECK-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; CHECK-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; CHECK-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; CHECK-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; CHECK-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; CHECK-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; CHECK-NEXT: [[C0:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A0]], i32 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A1]], i32 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A2]], i32 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A3]], i32 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A4]], i32 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A5]], i32 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A6]], i32 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A7]], i32 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A8]], i32 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A9]], i32 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A10]], i32 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A11]], i32 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A12]], i32 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A13]], i32 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A14]], i32 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 [[A15]], i32 [[B15]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i32, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i32, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i32, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i32, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i32, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i32, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i32, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i32, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i32, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i32, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i32, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i32, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i32, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i32, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i32, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i32, i1 } [[C15]], 0
+; CHECK-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; CHECK-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; CHECK-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; CHECK-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; CHECK-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; CHECK-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; CHECK-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; CHECK-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; CHECK-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; CHECK-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; CHECK-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; CHECK-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; CHECK-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; CHECK-NEXT: ret void
+;
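+; As in the i64 case, all sixteen overflow calls remain scalar.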
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %c0 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a0 , i32 %b0 )
+ %c1 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a1 , i32 %b1 )
+ %c2 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a2 , i32 %b2 )
+ %c3 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a3 , i32 %b3 )
+ %c4 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a4 , i32 %b4 )
+ %c5 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a5 , i32 %b5 )
+ %c6 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a6 , i32 %b6 )
+ %c7 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a7 , i32 %b7 )
+ %c8 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a8 , i32 %b8 )
+ %c9 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a9 , i32 %b9 )
+ %c10 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a10, i32 %b10)
+ %c11 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a11, i32 %b11)
+ %c12 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a12, i32 %b12)
+ %c13 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a13, i32 %b13)
+ %c14 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a14, i32 %b14)
+ %c15 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a15, i32 %b15)
+ %r0 = extractvalue {i32, i1} %c0 , 0
+ %r1 = extractvalue {i32, i1} %c1 , 0
+ %r2 = extractvalue {i32, i1} %c2 , 0
+ %r3 = extractvalue {i32, i1} %c3 , 0
+ %r4 = extractvalue {i32, i1} %c4 , 0
+ %r5 = extractvalue {i32, i1} %c5 , 0
+ %r6 = extractvalue {i32, i1} %c6 , 0
+ %r7 = extractvalue {i32, i1} %c7 , 0
+ %r8 = extractvalue {i32, i1} %c8 , 0
+ %r9 = extractvalue {i32, i1} %c9 , 0
+ %r10 = extractvalue {i32, i1} %c10, 0
+ %r11 = extractvalue {i32, i1} %c11, 0
+ %r12 = extractvalue {i32, i1} %c12, 0
+ %r13 = extractvalue {i32, i1} %c13, 0
+ %r14 = extractvalue {i32, i1} %c14, 0
+ %r15 = extractvalue {i32, i1} %c15, 0
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
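+; Same pattern for i16: judging by the generated checks below, the SLP
+; vectorizer is expected to leave every usub.with.overflow call scalar, so
+; each of the 32 lanes keeps its own load/call/extractvalue/store chain.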
+define void @sub_v32i16() {
+; CHECK-LABEL: @sub_v32i16(
+; CHECK-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; CHECK-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; CHECK-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; CHECK-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; CHECK-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; CHECK-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; CHECK-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; CHECK-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; CHECK-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; CHECK-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; CHECK-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; CHECK-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; CHECK-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; CHECK-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; CHECK-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; CHECK-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; CHECK-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; CHECK-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; CHECK-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; CHECK-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; CHECK-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; CHECK-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; CHECK-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; CHECK-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; CHECK-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; CHECK-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; CHECK-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; CHECK-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; CHECK-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; CHECK-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; CHECK-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; CHECK-NEXT: [[C0:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A0]], i16 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A1]], i16 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A2]], i16 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A3]], i16 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A4]], i16 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A5]], i16 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A6]], i16 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A7]], i16 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A8]], i16 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A9]], i16 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A10]], i16 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A11]], i16 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A12]], i16 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A13]], i16 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A14]], i16 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A15]], i16 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A16]], i16 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A17]], i16 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A18]], i16 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A19]], i16 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A20]], i16 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A21]], i16 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A22]], i16 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A23]], i16 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A24]], i16 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A25]], i16 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A26]], i16 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A27]], i16 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A28]], i16 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A29]], i16 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A30]], i16 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 [[A31]], i16 [[B31]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i16, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i16, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i16, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i16, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i16, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i16, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i16, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i16, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i16, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i16, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i16, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i16, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i16, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i16, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i16, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i16, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i16, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i16, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i16, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i16, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i16, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i16, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i16, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i16, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i16, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i16, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i16, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i16, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i16, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i16, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i16, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i16, i1 } [[C31]], 0
+; CHECK-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; CHECK-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; CHECK-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; CHECK-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; CHECK-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; CHECK-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; CHECK-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; CHECK-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; CHECK-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; CHECK-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; CHECK-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; CHECK-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; CHECK-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; CHECK-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; CHECK-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; CHECK-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; CHECK-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; CHECK-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; CHECK-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; CHECK-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; CHECK-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; CHECK-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; CHECK-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; CHECK-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; CHECK-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; CHECK-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; CHECK-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; CHECK-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; CHECK-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; CHECK-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; CHECK-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; CHECK-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; CHECK-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %c0 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a0 , i16 %b0 )
+ %c1 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a1 , i16 %b1 )
+ %c2 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a2 , i16 %b2 )
+ %c3 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a3 , i16 %b3 )
+ %c4 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a4 , i16 %b4 )
+ %c5 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a5 , i16 %b5 )
+ %c6 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a6 , i16 %b6 )
+ %c7 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a7 , i16 %b7 )
+ %c8 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a8 , i16 %b8 )
+ %c9 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a9 , i16 %b9 )
+ %c10 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a10, i16 %b10)
+ %c11 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a11, i16 %b11)
+ %c12 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a12, i16 %b12)
+ %c13 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a13, i16 %b13)
+ %c14 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a14, i16 %b14)
+ %c15 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a15, i16 %b15)
+ %c16 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a16, i16 %b16)
+ %c17 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a17, i16 %b17)
+ %c18 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a18, i16 %b18)
+ %c19 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a19, i16 %b19)
+ %c20 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a20, i16 %b20)
+ %c21 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a21, i16 %b21)
+ %c22 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a22, i16 %b22)
+ %c23 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a23, i16 %b23)
+ %c24 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a24, i16 %b24)
+ %c25 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a25, i16 %b25)
+ %c26 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a26, i16 %b26)
+ %c27 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a27, i16 %b27)
+ %c28 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a28, i16 %b28)
+ %c29 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a29, i16 %b29)
+ %c30 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a30, i16 %b30)
+ %c31 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %a31, i16 %b31)
+ %r0 = extractvalue {i16, i1} %c0 , 0
+ %r1 = extractvalue {i16, i1} %c1 , 0
+ %r2 = extractvalue {i16, i1} %c2 , 0
+ %r3 = extractvalue {i16, i1} %c3 , 0
+ %r4 = extractvalue {i16, i1} %c4 , 0
+ %r5 = extractvalue {i16, i1} %c5 , 0
+ %r6 = extractvalue {i16, i1} %c6 , 0
+ %r7 = extractvalue {i16, i1} %c7 , 0
+ %r8 = extractvalue {i16, i1} %c8 , 0
+ %r9 = extractvalue {i16, i1} %c9 , 0
+ %r10 = extractvalue {i16, i1} %c10, 0
+ %r11 = extractvalue {i16, i1} %c11, 0
+ %r12 = extractvalue {i16, i1} %c12, 0
+ %r13 = extractvalue {i16, i1} %c13, 0
+ %r14 = extractvalue {i16, i1} %c14, 0
+ %r15 = extractvalue {i16, i1} %c15, 0
+ %r16 = extractvalue {i16, i1} %c16, 0
+ %r17 = extractvalue {i16, i1} %c17, 0
+ %r18 = extractvalue {i16, i1} %c18, 0
+ %r19 = extractvalue {i16, i1} %c19, 0
+ %r20 = extractvalue {i16, i1} %c20, 0
+ %r21 = extractvalue {i16, i1} %c21, 0
+ %r22 = extractvalue {i16, i1} %c22, 0
+ %r23 = extractvalue {i16, i1} %c23, 0
+ %r24 = extractvalue {i16, i1} %c24, 0
+ %r25 = extractvalue {i16, i1} %c25, 0
+ %r26 = extractvalue {i16, i1} %c26, 0
+ %r27 = extractvalue {i16, i1} %c27, 0
+ %r28 = extractvalue {i16, i1} %c28, 0
+ %r29 = extractvalue {i16, i1} %c29, 0
+ %r30 = extractvalue {i16, i1} %c30, 0
+ %r31 = extractvalue {i16, i1} %c31, 0
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
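+; And likewise for i8: the checks expect all 64 lanes to stay scalar as well.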
+define void @sub_v64i8() {
+; CHECK-LABEL: @sub_v64i8(
+; CHECK-NEXT: [[A0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[A1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[A2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[A3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[A4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[A5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[A6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[A7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[A8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[A9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[A10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[A11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[A12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[A13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[A14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[A15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[A16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[A17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[A18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[A19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[A20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[A21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[A22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[A23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[A24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[A25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[A26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[A27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[A28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[A29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[A30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[A31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[A32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[A33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[A34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[A35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[A36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[A37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[A38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[A39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[A40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[A41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[A42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[A43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[A44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[A45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[A46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[A47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[A48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[A49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[A50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[A51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[A52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[A53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[A54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[A55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[A56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[A57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[A58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[A59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[A60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[A61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[A62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[A63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[B0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0), align 1
+; CHECK-NEXT: [[B1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[B2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2), align 1
+; CHECK-NEXT: [[B3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3), align 1
+; CHECK-NEXT: [[B4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4), align 1
+; CHECK-NEXT: [[B5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5), align 1
+; CHECK-NEXT: [[B6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6), align 1
+; CHECK-NEXT: [[B7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7), align 1
+; CHECK-NEXT: [[B8:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8), align 1
+; CHECK-NEXT: [[B9:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9), align 1
+; CHECK-NEXT: [[B10:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+; CHECK-NEXT: [[B11:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+; CHECK-NEXT: [[B12:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+; CHECK-NEXT: [[B13:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+; CHECK-NEXT: [[B14:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+; CHECK-NEXT: [[B15:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+; CHECK-NEXT: [[B16:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+; CHECK-NEXT: [[B17:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+; CHECK-NEXT: [[B18:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+; CHECK-NEXT: [[B19:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+; CHECK-NEXT: [[B20:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+; CHECK-NEXT: [[B21:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+; CHECK-NEXT: [[B22:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+; CHECK-NEXT: [[B23:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+; CHECK-NEXT: [[B24:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+; CHECK-NEXT: [[B25:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+; CHECK-NEXT: [[B26:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+; CHECK-NEXT: [[B27:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+; CHECK-NEXT: [[B28:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+; CHECK-NEXT: [[B29:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+; CHECK-NEXT: [[B30:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+; CHECK-NEXT: [[B31:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+; CHECK-NEXT: [[B32:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+; CHECK-NEXT: [[B33:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+; CHECK-NEXT: [[B34:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+; CHECK-NEXT: [[B35:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+; CHECK-NEXT: [[B36:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+; CHECK-NEXT: [[B37:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+; CHECK-NEXT: [[B38:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+; CHECK-NEXT: [[B39:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+; CHECK-NEXT: [[B40:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+; CHECK-NEXT: [[B41:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+; CHECK-NEXT: [[B42:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+; CHECK-NEXT: [[B43:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+; CHECK-NEXT: [[B44:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+; CHECK-NEXT: [[B45:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+; CHECK-NEXT: [[B46:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+; CHECK-NEXT: [[B47:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+; CHECK-NEXT: [[B48:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+; CHECK-NEXT: [[B49:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+; CHECK-NEXT: [[B50:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+; CHECK-NEXT: [[B51:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+; CHECK-NEXT: [[B52:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+; CHECK-NEXT: [[B53:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+; CHECK-NEXT: [[B54:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+; CHECK-NEXT: [[B55:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+; CHECK-NEXT: [[B56:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+; CHECK-NEXT: [[B57:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+; CHECK-NEXT: [[B58:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+; CHECK-NEXT: [[B59:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+; CHECK-NEXT: [[B60:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+; CHECK-NEXT: [[B61:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+; CHECK-NEXT: [[B62:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+; CHECK-NEXT: [[B63:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+; CHECK-NEXT: [[C0:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A0]], i8 [[B0]])
+; CHECK-NEXT: [[C1:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A1]], i8 [[B1]])
+; CHECK-NEXT: [[C2:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A2]], i8 [[B2]])
+; CHECK-NEXT: [[C3:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A3]], i8 [[B3]])
+; CHECK-NEXT: [[C4:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A4]], i8 [[B4]])
+; CHECK-NEXT: [[C5:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A5]], i8 [[B5]])
+; CHECK-NEXT: [[C6:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A6]], i8 [[B6]])
+; CHECK-NEXT: [[C7:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A7]], i8 [[B7]])
+; CHECK-NEXT: [[C8:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A8]], i8 [[B8]])
+; CHECK-NEXT: [[C9:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A9]], i8 [[B9]])
+; CHECK-NEXT: [[C10:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A10]], i8 [[B10]])
+; CHECK-NEXT: [[C11:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A11]], i8 [[B11]])
+; CHECK-NEXT: [[C12:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A12]], i8 [[B12]])
+; CHECK-NEXT: [[C13:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A13]], i8 [[B13]])
+; CHECK-NEXT: [[C14:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A14]], i8 [[B14]])
+; CHECK-NEXT: [[C15:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A15]], i8 [[B15]])
+; CHECK-NEXT: [[C16:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A16]], i8 [[B16]])
+; CHECK-NEXT: [[C17:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A17]], i8 [[B17]])
+; CHECK-NEXT: [[C18:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A18]], i8 [[B18]])
+; CHECK-NEXT: [[C19:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A19]], i8 [[B19]])
+; CHECK-NEXT: [[C20:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A20]], i8 [[B20]])
+; CHECK-NEXT: [[C21:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A21]], i8 [[B21]])
+; CHECK-NEXT: [[C22:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A22]], i8 [[B22]])
+; CHECK-NEXT: [[C23:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A23]], i8 [[B23]])
+; CHECK-NEXT: [[C24:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A24]], i8 [[B24]])
+; CHECK-NEXT: [[C25:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A25]], i8 [[B25]])
+; CHECK-NEXT: [[C26:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A26]], i8 [[B26]])
+; CHECK-NEXT: [[C27:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A27]], i8 [[B27]])
+; CHECK-NEXT: [[C28:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A28]], i8 [[B28]])
+; CHECK-NEXT: [[C29:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A29]], i8 [[B29]])
+; CHECK-NEXT: [[C30:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A30]], i8 [[B30]])
+; CHECK-NEXT: [[C31:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A31]], i8 [[B31]])
+; CHECK-NEXT: [[C32:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A32]], i8 [[B32]])
+; CHECK-NEXT: [[C33:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A33]], i8 [[B33]])
+; CHECK-NEXT: [[C34:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A34]], i8 [[B34]])
+; CHECK-NEXT: [[C35:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A35]], i8 [[B35]])
+; CHECK-NEXT: [[C36:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A36]], i8 [[B36]])
+; CHECK-NEXT: [[C37:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A37]], i8 [[B37]])
+; CHECK-NEXT: [[C38:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A38]], i8 [[B38]])
+; CHECK-NEXT: [[C39:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A39]], i8 [[B39]])
+; CHECK-NEXT: [[C40:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A40]], i8 [[B40]])
+; CHECK-NEXT: [[C41:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A41]], i8 [[B41]])
+; CHECK-NEXT: [[C42:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A42]], i8 [[B42]])
+; CHECK-NEXT: [[C43:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A43]], i8 [[B43]])
+; CHECK-NEXT: [[C44:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A44]], i8 [[B44]])
+; CHECK-NEXT: [[C45:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A45]], i8 [[B45]])
+; CHECK-NEXT: [[C46:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A46]], i8 [[B46]])
+; CHECK-NEXT: [[C47:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A47]], i8 [[B47]])
+; CHECK-NEXT: [[C48:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A48]], i8 [[B48]])
+; CHECK-NEXT: [[C49:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A49]], i8 [[B49]])
+; CHECK-NEXT: [[C50:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A50]], i8 [[B50]])
+; CHECK-NEXT: [[C51:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A51]], i8 [[B51]])
+; CHECK-NEXT: [[C52:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A52]], i8 [[B52]])
+; CHECK-NEXT: [[C53:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A53]], i8 [[B53]])
+; CHECK-NEXT: [[C54:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A54]], i8 [[B54]])
+; CHECK-NEXT: [[C55:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A55]], i8 [[B55]])
+; CHECK-NEXT: [[C56:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A56]], i8 [[B56]])
+; CHECK-NEXT: [[C57:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A57]], i8 [[B57]])
+; CHECK-NEXT: [[C58:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A58]], i8 [[B58]])
+; CHECK-NEXT: [[C59:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A59]], i8 [[B59]])
+; CHECK-NEXT: [[C60:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A60]], i8 [[B60]])
+; CHECK-NEXT: [[C61:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A61]], i8 [[B61]])
+; CHECK-NEXT: [[C62:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A62]], i8 [[B62]])
+; CHECK-NEXT: [[C63:%.*]] = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 [[A63]], i8 [[B63]])
+; CHECK-NEXT: [[R0:%.*]] = extractvalue { i8, i1 } [[C0]], 0
+; CHECK-NEXT: [[R1:%.*]] = extractvalue { i8, i1 } [[C1]], 0
+; CHECK-NEXT: [[R2:%.*]] = extractvalue { i8, i1 } [[C2]], 0
+; CHECK-NEXT: [[R3:%.*]] = extractvalue { i8, i1 } [[C3]], 0
+; CHECK-NEXT: [[R4:%.*]] = extractvalue { i8, i1 } [[C4]], 0
+; CHECK-NEXT: [[R5:%.*]] = extractvalue { i8, i1 } [[C5]], 0
+; CHECK-NEXT: [[R6:%.*]] = extractvalue { i8, i1 } [[C6]], 0
+; CHECK-NEXT: [[R7:%.*]] = extractvalue { i8, i1 } [[C7]], 0
+; CHECK-NEXT: [[R8:%.*]] = extractvalue { i8, i1 } [[C8]], 0
+; CHECK-NEXT: [[R9:%.*]] = extractvalue { i8, i1 } [[C9]], 0
+; CHECK-NEXT: [[R10:%.*]] = extractvalue { i8, i1 } [[C10]], 0
+; CHECK-NEXT: [[R11:%.*]] = extractvalue { i8, i1 } [[C11]], 0
+; CHECK-NEXT: [[R12:%.*]] = extractvalue { i8, i1 } [[C12]], 0
+; CHECK-NEXT: [[R13:%.*]] = extractvalue { i8, i1 } [[C13]], 0
+; CHECK-NEXT: [[R14:%.*]] = extractvalue { i8, i1 } [[C14]], 0
+; CHECK-NEXT: [[R15:%.*]] = extractvalue { i8, i1 } [[C15]], 0
+; CHECK-NEXT: [[R16:%.*]] = extractvalue { i8, i1 } [[C16]], 0
+; CHECK-NEXT: [[R17:%.*]] = extractvalue { i8, i1 } [[C17]], 0
+; CHECK-NEXT: [[R18:%.*]] = extractvalue { i8, i1 } [[C18]], 0
+; CHECK-NEXT: [[R19:%.*]] = extractvalue { i8, i1 } [[C19]], 0
+; CHECK-NEXT: [[R20:%.*]] = extractvalue { i8, i1 } [[C20]], 0
+; CHECK-NEXT: [[R21:%.*]] = extractvalue { i8, i1 } [[C21]], 0
+; CHECK-NEXT: [[R22:%.*]] = extractvalue { i8, i1 } [[C22]], 0
+; CHECK-NEXT: [[R23:%.*]] = extractvalue { i8, i1 } [[C23]], 0
+; CHECK-NEXT: [[R24:%.*]] = extractvalue { i8, i1 } [[C24]], 0
+; CHECK-NEXT: [[R25:%.*]] = extractvalue { i8, i1 } [[C25]], 0
+; CHECK-NEXT: [[R26:%.*]] = extractvalue { i8, i1 } [[C26]], 0
+; CHECK-NEXT: [[R27:%.*]] = extractvalue { i8, i1 } [[C27]], 0
+; CHECK-NEXT: [[R28:%.*]] = extractvalue { i8, i1 } [[C28]], 0
+; CHECK-NEXT: [[R29:%.*]] = extractvalue { i8, i1 } [[C29]], 0
+; CHECK-NEXT: [[R30:%.*]] = extractvalue { i8, i1 } [[C30]], 0
+; CHECK-NEXT: [[R31:%.*]] = extractvalue { i8, i1 } [[C31]], 0
+; CHECK-NEXT: [[R32:%.*]] = extractvalue { i8, i1 } [[C32]], 0
+; CHECK-NEXT: [[R33:%.*]] = extractvalue { i8, i1 } [[C33]], 0
+; CHECK-NEXT: [[R34:%.*]] = extractvalue { i8, i1 } [[C34]], 0
+; CHECK-NEXT: [[R35:%.*]] = extractvalue { i8, i1 } [[C35]], 0
+; CHECK-NEXT: [[R36:%.*]] = extractvalue { i8, i1 } [[C36]], 0
+; CHECK-NEXT: [[R37:%.*]] = extractvalue { i8, i1 } [[C37]], 0
+; CHECK-NEXT: [[R38:%.*]] = extractvalue { i8, i1 } [[C38]], 0
+; CHECK-NEXT: [[R39:%.*]] = extractvalue { i8, i1 } [[C39]], 0
+; CHECK-NEXT: [[R40:%.*]] = extractvalue { i8, i1 } [[C40]], 0
+; CHECK-NEXT: [[R41:%.*]] = extractvalue { i8, i1 } [[C41]], 0
+; CHECK-NEXT: [[R42:%.*]] = extractvalue { i8, i1 } [[C42]], 0
+; CHECK-NEXT: [[R43:%.*]] = extractvalue { i8, i1 } [[C43]], 0
+; CHECK-NEXT: [[R44:%.*]] = extractvalue { i8, i1 } [[C44]], 0
+; CHECK-NEXT: [[R45:%.*]] = extractvalue { i8, i1 } [[C45]], 0
+; CHECK-NEXT: [[R46:%.*]] = extractvalue { i8, i1 } [[C46]], 0
+; CHECK-NEXT: [[R47:%.*]] = extractvalue { i8, i1 } [[C47]], 0
+; CHECK-NEXT: [[R48:%.*]] = extractvalue { i8, i1 } [[C48]], 0
+; CHECK-NEXT: [[R49:%.*]] = extractvalue { i8, i1 } [[C49]], 0
+; CHECK-NEXT: [[R50:%.*]] = extractvalue { i8, i1 } [[C50]], 0
+; CHECK-NEXT: [[R51:%.*]] = extractvalue { i8, i1 } [[C51]], 0
+; CHECK-NEXT: [[R52:%.*]] = extractvalue { i8, i1 } [[C52]], 0
+; CHECK-NEXT: [[R53:%.*]] = extractvalue { i8, i1 } [[C53]], 0
+; CHECK-NEXT: [[R54:%.*]] = extractvalue { i8, i1 } [[C54]], 0
+; CHECK-NEXT: [[R55:%.*]] = extractvalue { i8, i1 } [[C55]], 0
+; CHECK-NEXT: [[R56:%.*]] = extractvalue { i8, i1 } [[C56]], 0
+; CHECK-NEXT: [[R57:%.*]] = extractvalue { i8, i1 } [[C57]], 0
+; CHECK-NEXT: [[R58:%.*]] = extractvalue { i8, i1 } [[C58]], 0
+; CHECK-NEXT: [[R59:%.*]] = extractvalue { i8, i1 } [[C59]], 0
+; CHECK-NEXT: [[R60:%.*]] = extractvalue { i8, i1 } [[C60]], 0
+; CHECK-NEXT: [[R61:%.*]] = extractvalue { i8, i1 } [[C61]], 0
+; CHECK-NEXT: [[R62:%.*]] = extractvalue { i8, i1 } [[C62]], 0
+; CHECK-NEXT: [[R63:%.*]] = extractvalue { i8, i1 } [[C63]], 0
+; CHECK-NEXT: store i8 [[R0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[R1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[R2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[R3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[R4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[R5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[R6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[R7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7), align 1
+; CHECK-NEXT: store i8 [[R8]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8), align 1
+; CHECK-NEXT: store i8 [[R9]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9), align 1
+; CHECK-NEXT: store i8 [[R10]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+; CHECK-NEXT: store i8 [[R11]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+; CHECK-NEXT: store i8 [[R12]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+; CHECK-NEXT: store i8 [[R13]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+; CHECK-NEXT: store i8 [[R14]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+; CHECK-NEXT: store i8 [[R15]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+; CHECK-NEXT: store i8 [[R16]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+; CHECK-NEXT: store i8 [[R17]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+; CHECK-NEXT: store i8 [[R18]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+; CHECK-NEXT: store i8 [[R19]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+; CHECK-NEXT: store i8 [[R20]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+; CHECK-NEXT: store i8 [[R21]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+; CHECK-NEXT: store i8 [[R22]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+; CHECK-NEXT: store i8 [[R23]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+; CHECK-NEXT: store i8 [[R24]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+; CHECK-NEXT: store i8 [[R25]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+; CHECK-NEXT: store i8 [[R26]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+; CHECK-NEXT: store i8 [[R27]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+; CHECK-NEXT: store i8 [[R28]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+; CHECK-NEXT: store i8 [[R29]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+; CHECK-NEXT: store i8 [[R30]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+; CHECK-NEXT: store i8 [[R31]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+; CHECK-NEXT: store i8 [[R32]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+; CHECK-NEXT: store i8 [[R33]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+; CHECK-NEXT: store i8 [[R34]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+; CHECK-NEXT: store i8 [[R35]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+; CHECK-NEXT: store i8 [[R36]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+; CHECK-NEXT: store i8 [[R37]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+; CHECK-NEXT: store i8 [[R38]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+; CHECK-NEXT: store i8 [[R39]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+; CHECK-NEXT: store i8 [[R40]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+; CHECK-NEXT: store i8 [[R41]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+; CHECK-NEXT: store i8 [[R42]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+; CHECK-NEXT: store i8 [[R43]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+; CHECK-NEXT: store i8 [[R44]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+; CHECK-NEXT: store i8 [[R45]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+; CHECK-NEXT: store i8 [[R46]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+; CHECK-NEXT: store i8 [[R47]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+; CHECK-NEXT: store i8 [[R48]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+; CHECK-NEXT: store i8 [[R49]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+; CHECK-NEXT: store i8 [[R50]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+; CHECK-NEXT: store i8 [[R51]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+; CHECK-NEXT: store i8 [[R52]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+; CHECK-NEXT: store i8 [[R53]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+; CHECK-NEXT: store i8 [[R54]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+; CHECK-NEXT: store i8 [[R55]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+; CHECK-NEXT: store i8 [[R56]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+; CHECK-NEXT: store i8 [[R57]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+; CHECK-NEXT: store i8 [[R58]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+; CHECK-NEXT: store i8 [[R59]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+; CHECK-NEXT: store i8 [[R60]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+; CHECK-NEXT: store i8 [[R61]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+; CHECK-NEXT: store i8 [[R62]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+; CHECK-NEXT: store i8 [[R63]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %c0 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a0 , i8 %b0 )
+ %c1 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a1 , i8 %b1 )
+ %c2 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a2 , i8 %b2 )
+ %c3 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a3 , i8 %b3 )
+ %c4 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a4 , i8 %b4 )
+ %c5 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a5 , i8 %b5 )
+ %c6 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a6 , i8 %b6 )
+ %c7 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a7 , i8 %b7 )
+ %c8 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a8 , i8 %b8 )
+ %c9 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a9 , i8 %b9 )
+ %c10 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a10, i8 %b10)
+ %c11 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a11, i8 %b11)
+ %c12 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a12, i8 %b12)
+ %c13 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a13, i8 %b13)
+ %c14 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a14, i8 %b14)
+ %c15 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a15, i8 %b15)
+ %c16 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a16, i8 %b16)
+ %c17 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a17, i8 %b17)
+ %c18 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a18, i8 %b18)
+ %c19 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a19, i8 %b19)
+ %c20 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a20, i8 %b20)
+ %c21 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a21, i8 %b21)
+ %c22 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a22, i8 %b22)
+ %c23 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a23, i8 %b23)
+ %c24 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a24, i8 %b24)
+ %c25 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a25, i8 %b25)
+ %c26 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a26, i8 %b26)
+ %c27 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a27, i8 %b27)
+ %c28 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a28, i8 %b28)
+ %c29 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a29, i8 %b29)
+ %c30 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a30, i8 %b30)
+ %c31 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a31, i8 %b31)
+ %c32 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a32, i8 %b32)
+ %c33 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a33, i8 %b33)
+ %c34 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a34, i8 %b34)
+ %c35 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a35, i8 %b35)
+ %c36 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a36, i8 %b36)
+ %c37 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a37, i8 %b37)
+ %c38 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a38, i8 %b38)
+ %c39 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a39, i8 %b39)
+ %c40 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a40, i8 %b40)
+ %c41 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a41, i8 %b41)
+ %c42 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a42, i8 %b42)
+ %c43 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a43, i8 %b43)
+ %c44 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a44, i8 %b44)
+ %c45 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a45, i8 %b45)
+ %c46 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a46, i8 %b46)
+ %c47 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a47, i8 %b47)
+ %c48 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a48, i8 %b48)
+ %c49 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a49, i8 %b49)
+ %c50 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a50, i8 %b50)
+ %c51 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a51, i8 %b51)
+ %c52 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a52, i8 %b52)
+ %c53 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a53, i8 %b53)
+ %c54 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a54, i8 %b54)
+ %c55 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a55, i8 %b55)
+ %c56 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a56, i8 %b56)
+ %c57 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a57, i8 %b57)
+ %c58 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a58, i8 %b58)
+ %c59 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a59, i8 %b59)
+ %c60 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a60, i8 %b60)
+ %c61 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a61, i8 %b61)
+ %c62 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a62, i8 %b62)
+ %c63 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %a63, i8 %b63)
+ %r0 = extractvalue {i8, i1} %c0 , 0
+ %r1 = extractvalue {i8, i1} %c1 , 0
+ %r2 = extractvalue {i8, i1} %c2 , 0
+ %r3 = extractvalue {i8, i1} %c3 , 0
+ %r4 = extractvalue {i8, i1} %c4 , 0
+ %r5 = extractvalue {i8, i1} %c5 , 0
+ %r6 = extractvalue {i8, i1} %c6 , 0
+ %r7 = extractvalue {i8, i1} %c7 , 0
+ %r8 = extractvalue {i8, i1} %c8 , 0
+ %r9 = extractvalue {i8, i1} %c9 , 0
+ %r10 = extractvalue {i8, i1} %c10, 0
+ %r11 = extractvalue {i8, i1} %c11, 0
+ %r12 = extractvalue {i8, i1} %c12, 0
+ %r13 = extractvalue {i8, i1} %c13, 0
+ %r14 = extractvalue {i8, i1} %c14, 0
+ %r15 = extractvalue {i8, i1} %c15, 0
+ %r16 = extractvalue {i8, i1} %c16, 0
+ %r17 = extractvalue {i8, i1} %c17, 0
+ %r18 = extractvalue {i8, i1} %c18, 0
+ %r19 = extractvalue {i8, i1} %c19, 0
+ %r20 = extractvalue {i8, i1} %c20, 0
+ %r21 = extractvalue {i8, i1} %c21, 0
+ %r22 = extractvalue {i8, i1} %c22, 0
+ %r23 = extractvalue {i8, i1} %c23, 0
+ %r24 = extractvalue {i8, i1} %c24, 0
+ %r25 = extractvalue {i8, i1} %c25, 0
+ %r26 = extractvalue {i8, i1} %c26, 0
+ %r27 = extractvalue {i8, i1} %c27, 0
+ %r28 = extractvalue {i8, i1} %c28, 0
+ %r29 = extractvalue {i8, i1} %c29, 0
+ %r30 = extractvalue {i8, i1} %c30, 0
+ %r31 = extractvalue {i8, i1} %c31, 0
+ %r32 = extractvalue {i8, i1} %c32, 0
+ %r33 = extractvalue {i8, i1} %c33, 0
+ %r34 = extractvalue {i8, i1} %c34, 0
+ %r35 = extractvalue {i8, i1} %c35, 0
+ %r36 = extractvalue {i8, i1} %c36, 0
+ %r37 = extractvalue {i8, i1} %c37, 0
+ %r38 = extractvalue {i8, i1} %c38, 0
+ %r39 = extractvalue {i8, i1} %c39, 0
+ %r40 = extractvalue {i8, i1} %c40, 0
+ %r41 = extractvalue {i8, i1} %c41, 0
+ %r42 = extractvalue {i8, i1} %c42, 0
+ %r43 = extractvalue {i8, i1} %c43, 0
+ %r44 = extractvalue {i8, i1} %c44, 0
+ %r45 = extractvalue {i8, i1} %c45, 0
+ %r46 = extractvalue {i8, i1} %c46, 0
+ %r47 = extractvalue {i8, i1} %c47, 0
+ %r48 = extractvalue {i8, i1} %c48, 0
+ %r49 = extractvalue {i8, i1} %c49, 0
+ %r50 = extractvalue {i8, i1} %c50, 0
+ %r51 = extractvalue {i8, i1} %c51, 0
+ %r52 = extractvalue {i8, i1} %c52, 0
+ %r53 = extractvalue {i8, i1} %c53, 0
+ %r54 = extractvalue {i8, i1} %c54, 0
+ %r55 = extractvalue {i8, i1} %c55, 0
+ %r56 = extractvalue {i8, i1} %c56, 0
+ %r57 = extractvalue {i8, i1} %c57, 0
+ %r58 = extractvalue {i8, i1} %c58, 0
+ %r59 = extractvalue {i8, i1} %c59, 0
+ %r60 = extractvalue {i8, i1} %c60, 0
+ %r61 = extractvalue {i8, i1} %c61, 0
+ %r62 = extractvalue {i8, i1} %c62, 0
+ %r63 = extractvalue {i8, i1} %c63, 0
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
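+; Note: the CHECK lines above keep every @llvm.usub.with.overflow.i8 call and
+; its extractvalue scalar under all run configurations, i.e. the SLP vectorizer
+; does not form vector overflow intrinsics for this pattern.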
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
new file mode 100644
index 00000000000..28c807408b7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
@@ -0,0 +1,708 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
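+;
+; To refresh the assertions after a codegen change, rerun the script named in
+; the NOTE above against this file (assumed invocation; point --opt-binary at
+; the opt binary from your own build tree):
+;   llvm/utils/update_test_checks.py --opt-binary=build/bin/opt arith-sub.ll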
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
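+; Each function below computes the elementwise subtraction c[i] = a[i] - b[i]
+; over one of the global arrays above using scalar loads, subs, and stores; the
+; per-prefix CHECK lines verify that the SLP vectorizer merges them into vector
+; ops at the widest width the target supports (e.g. <2 x i64> for SSE/SLM,
+; <4 x i64> for AVX, <8 x i64> for AVX512).
+;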
+define void @sub_v8i64() {
+; SSE-LABEL: @sub_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v8i64(
+; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
+; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
+; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
+; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
+; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @sub_v8i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = sub i64 %a0, %b0
+ %r1 = sub i64 %a1, %b1
+ %r2 = sub i64 %a2, %b2
+ %r3 = sub i64 %a3, %b3
+ %r4 = sub i64 %a4, %b4
+ %r5 = sub i64 %a5, %b5
+ %r6 = sub i64 %a6, %b6
+ %r7 = sub i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @sub_v16i32() {
+; SSE-LABEL: @sub_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v16i32(
+; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]]
+; SLM-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]]
+; SLM-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]]
+; SLM-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
+; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @sub_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = sub i32 %a0 , %b0
+ %r1 = sub i32 %a1 , %b1
+ %r2 = sub i32 %a2 , %b2
+ %r3 = sub i32 %a3 , %b3
+ %r4 = sub i32 %a4 , %b4
+ %r5 = sub i32 %a5 , %b5
+ %r6 = sub i32 %a6 , %b6
+ %r7 = sub i32 %a7 , %b7
+ %r8 = sub i32 %a8 , %b8
+ %r9 = sub i32 %a9 , %b9
+ %r10 = sub i32 %a10, %b10
+ %r11 = sub i32 %a11, %b11
+ %r12 = sub i32 %a12, %b12
+ %r13 = sub i32 %a13, %b13
+ %r14 = sub i32 %a14, %b14
+ %r15 = sub i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
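+; With i16 elements the chosen vector width is target dependent: SSE and SLM
+; split the 32 subtractions into four <8 x i16> operations (producing identical
+; IR here), while AVX and AVX512 use two <16 x i16> operations.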
+define void @sub_v32i16() {
+; SSE-LABEL: @sub_v32i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; SLM-LABEL: @sub_v32i16(
+; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]]
+; SLM-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]]
+; SLM-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]]
+; SLM-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]]
+; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
+; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
+; SLM-NEXT: ret void
+;
+; AVX-LABEL: @sub_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @sub_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = sub i16 %a0 , %b0
+ %r1 = sub i16 %a1 , %b1
+ %r2 = sub i16 %a2 , %b2
+ %r3 = sub i16 %a3 , %b3
+ %r4 = sub i16 %a4 , %b4
+ %r5 = sub i16 %a5 , %b5
+ %r6 = sub i16 %a6 , %b6
+ %r7 = sub i16 %a7 , %b7
+ %r8 = sub i16 %a8 , %b8
+ %r9 = sub i16 %a9 , %b9
+ %r10 = sub i16 %a10, %b10
+ %r11 = sub i16 %a11, %b11
+ %r12 = sub i16 %a12, %b12
+ %r13 = sub i16 %a13, %b13
+ %r14 = sub i16 %a14, %b14
+ %r15 = sub i16 %a15, %b15
+ %r16 = sub i16 %a16, %b16
+ %r17 = sub i16 %a17, %b17
+ %r18 = sub i16 %a18, %b18
+ %r19 = sub i16 %a19, %b19
+ %r20 = sub i16 %a20, %b20
+ %r21 = sub i16 %a21, %b21
+ %r22 = sub i16 %a22, %b22
+ %r23 = sub i16 %a23, %b23
+ %r24 = sub i16 %a24, %b24
+ %r25 = sub i16 %a25, %b25
+ %r26 = sub i16 %a26, %b26
+ %r27 = sub i16 %a27, %b27
+ %r28 = sub i16 %a28, %b28
+ %r29 = sub i16 %a29, %b29
+ %r30 = sub i16 %a30, %b30
+ %r31 = sub i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
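+; For i8 elements all tested targets use 128-bit vectors, so a single CHECK
+; prefix suffices: the 64 subtractions become four <16 x i8> operations.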
+define void @sub_v64i8() {
+; CHECK-LABEL: @sub_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = sub i8 %a0 , %b0
+ %r1 = sub i8 %a1 , %b1
+ %r2 = sub i8 %a2 , %b2
+ %r3 = sub i8 %a3 , %b3
+ %r4 = sub i8 %a4 , %b4
+ %r5 = sub i8 %a5 , %b5
+ %r6 = sub i8 %a6 , %b6
+ %r7 = sub i8 %a7 , %b7
+ %r8 = sub i8 %a8 , %b8
+ %r9 = sub i8 %a9 , %b9
+ %r10 = sub i8 %a10, %b10
+ %r11 = sub i8 %a11, %b11
+ %r12 = sub i8 %a12, %b12
+ %r13 = sub i8 %a13, %b13
+ %r14 = sub i8 %a14, %b14
+ %r15 = sub i8 %a15, %b15
+ %r16 = sub i8 %a16, %b16
+ %r17 = sub i8 %a17, %b17
+ %r18 = sub i8 %a18, %b18
+ %r19 = sub i8 %a19, %b19
+ %r20 = sub i8 %a20, %b20
+ %r21 = sub i8 %a21, %b21
+ %r22 = sub i8 %a22, %b22
+ %r23 = sub i8 %a23, %b23
+ %r24 = sub i8 %a24, %b24
+ %r25 = sub i8 %a25, %b25
+ %r26 = sub i8 %a26, %b26
+ %r27 = sub i8 %a27, %b27
+ %r28 = sub i8 %a28, %b28
+ %r29 = sub i8 %a29, %b29
+ %r30 = sub i8 %a30, %b30
+ %r31 = sub i8 %a31, %b31
+ %r32 = sub i8 %a32, %b32
+ %r33 = sub i8 %a33, %b33
+ %r34 = sub i8 %a34, %b34
+ %r35 = sub i8 %a35, %b35
+ %r36 = sub i8 %a36, %b36
+ %r37 = sub i8 %a37, %b37
+ %r38 = sub i8 %a38, %b38
+ %r39 = sub i8 %a39, %b39
+ %r40 = sub i8 %a40, %b40
+ %r41 = sub i8 %a41, %b41
+ %r42 = sub i8 %a42, %b42
+ %r43 = sub i8 %a43, %b43
+ %r44 = sub i8 %a44, %b44
+ %r45 = sub i8 %a45, %b45
+ %r46 = sub i8 %a46, %b46
+ %r47 = sub i8 %a47, %b47
+ %r48 = sub i8 %a48, %b48
+ %r49 = sub i8 %a49, %b49
+ %r50 = sub i8 %a50, %b50
+ %r51 = sub i8 %a51, %b51
+ %r52 = sub i8 %a52, %b52
+ %r53 = sub i8 %a53, %b53
+ %r54 = sub i8 %a54, %b54
+ %r55 = sub i8 %a55, %b55
+ %r56 = sub i8 %a56, %b56
+ %r57 = sub i8 %a57, %b57
+ %r58 = sub i8 %a58, %b58
+ %r59 = sub i8 %a59, %b59
+ %r60 = sub i8 %a60, %b60
+ %r61 = sub i8 %a61, %b61
+ %r62 = sub i8 %a62, %b62
+ %r63 = sub i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/atomics.ll b/llvm/test/Transforms/SLPVectorizer/X86/atomics.ll
new file mode 100644
index 00000000000..c7f05491a18
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/atomics.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@x = global [4 x i32] zeroinitializer, align 16
+@a = global [4 x i32] zeroinitializer, align 16
+
+; The SLPVectorizer should not vectorize atomic stores and it should not
+; schedule regular stores around atomic stores.
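+; The release-ordered atomic stores act as scheduling barriers here, so the
+; interleaved plain stores to @a must stay scalar and in their original order.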
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 0), align 16
+; CHECK-NEXT: store atomic i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 0) release, align 16
+; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 1), align 4
+; CHECK-NEXT: store atomic i32 1, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 1) release, align 4
+; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 2), align 8
+; CHECK-NEXT: store atomic i32 2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 2) release, align 8
+; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 3), align 4
+; CHECK-NEXT: store atomic i32 3, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 3) release, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 0), align 16
+ store atomic i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 0) release, align 16
+ store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 1), align 4
+ store atomic i32 1, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 1) release, align 4
+ store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 2), align 8
+ store atomic i32 2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 2) release, align 8
+ store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 3), align 4
+ store atomic i32 3, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 3) release, align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
new file mode 100644
index 00000000000..d229acd1789
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1(x86_mmx %a, x86_mmx %b, i64* %ptr) {
+; Ensure we can handle x86_mmx values, which are primitive and can be bitcast
+; to integer types but can't be put into a vector.
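+; x86_mmx has no element layout, so the bitcast results cannot be gathered into
+; a vector; the checks below verify that the i64 operations remain scalar.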
+;
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64
+; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64
+; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42
+; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, i64* [[PTR:%.*]], i32 1
+; CHECK-NEXT: store i64 [[A_AND]], i64* [[PTR]]
+; CHECK-NEXT: store i64 [[B_AND]], i64* [[GEP]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %a.cast = bitcast x86_mmx %a to i64
+ %b.cast = bitcast x86_mmx %b to i64
+ %a.and = and i64 %a.cast, 42
+ %b.and = and i64 %b.cast, 42
+ %gep = getelementptr i64, i64* %ptr, i32 1
+ store i64 %a.and, i64* %ptr
+ store i64 %b.and, i64* %gep
+ ret void
+}
+
+define void @test2(x86_mmx %a, x86_mmx %b) {
+; Same as @test1 but using phi-input vectorization instead of store
+; vectorization.
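+; The phi inputs are the same x86_mmx-derived values, so the phis are not
+; combined into a <2 x i64> phi either.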
+;
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[EXIT:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64
+; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64
+; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42
+; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: [[A_PHI:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[A_AND]], [[IF_THEN]] ]
+; CHECK-NEXT: [[B_PHI:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[B_AND]], [[IF_THEN]] ]
+; CHECK-NEXT: tail call void @f(i64 [[A_PHI]], i64 [[B_PHI]])
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %if.then, label %exit
+
+if.then:
+ %a.cast = bitcast x86_mmx %a to i64
+ %b.cast = bitcast x86_mmx %b to i64
+ %a.and = and i64 %a.cast, 42
+ %b.and = and i64 %b.cast, 42
+ br label %exit
+
+exit:
+ %a.phi = phi i64 [ 0, %entry ], [ %a.and, %if.then ]
+ %b.phi = phi i64 [ 0, %entry ], [ %b.and, %if.then ]
+ tail call void @f(i64 %a.phi, i64 %b.phi)
+ ret void
+}
+
+define i8 @test3(i8* %addr) {
+; Check that we do not vectorize types that are padded out to bigger types.
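+; In memory each i2 is padded out to a full byte, so four consecutive i2 loads
+; cover 4 bytes while a <4 x i2> vector would occupy only 1 byte.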
+;
+; CHECK-LABEL: @test3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A:%.*]] = bitcast i8* [[ADDR:%.*]] to i2*
+; CHECK-NEXT: [[A0:%.*]] = getelementptr inbounds i2, i2* [[A]], i64 0
+; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i2, i2* [[A]], i64 1
+; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i2, i2* [[A]], i64 2
+; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds i2, i2* [[A]], i64 3
+; CHECK-NEXT: [[L0:%.*]] = load i2, i2* [[A0]], align 1
+; CHECK-NEXT: [[L1:%.*]] = load i2, i2* [[A1]], align 1
+; CHECK-NEXT: [[L2:%.*]] = load i2, i2* [[A2]], align 1
+; CHECK-NEXT: [[L3:%.*]] = load i2, i2* [[A3]], align 1
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[P0:%.*]] = phi i2 [ [[L0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[P1:%.*]] = phi i2 [ [[L1]], [[ENTRY]] ]
+; CHECK-NEXT: [[P2:%.*]] = phi i2 [ [[L2]], [[ENTRY]] ]
+; CHECK-NEXT: [[P3:%.*]] = phi i2 [ [[L3]], [[ENTRY]] ]
+; CHECK-NEXT: [[R:%.*]] = zext i2 [[P2]] to i8
+; CHECK-NEXT: ret i8 [[R]]
+;
+entry:
+ %a = bitcast i8* %addr to i2*
+ %a0 = getelementptr inbounds i2, i2* %a, i64 0
+ %a1 = getelementptr inbounds i2, i2* %a, i64 1
+ %a2 = getelementptr inbounds i2, i2* %a, i64 2
+ %a3 = getelementptr inbounds i2, i2* %a, i64 3
+ %l0 = load i2, i2* %a0, align 1
+ %l1 = load i2, i2* %a1, align 1
+ %l2 = load i2, i2* %a2, align 1
+ %l3 = load i2, i2* %a3, align 1
+ br label %bb1
+bb1: ; preds = %entry
+ %p0 = phi i2 [ %l0, %entry ]
+ %p1 = phi i2 [ %l1, %entry ]
+ %p2 = phi i2 [ %l2, %entry ]
+ %p3 = phi i2 [ %l3, %entry ]
+ %r = zext i2 %p2 to i8
+ ret i8 %r
+}
+
+declare void @f(i64, i64)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
new file mode 100644
index 00000000000..2ea29ed95c7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @foo(i32* nocapture %A, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar()
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[N]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[N]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
+; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 9, i32 9, i32 9, i32 9>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %call = tail call i32 (...) @bar() #2
+ %mul = mul nsw i32 %n, 5
+ %add = add nsw i32 %mul, 9
+ store i32 %add, i32* %A, align 4
+ %mul1 = mul nsw i32 %n, 9
+ %add2 = add nsw i32 %mul1, 9
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
+ store i32 %add2, i32* %arrayidx3, align 4
+ %mul4 = shl i32 %n, 3
+ %add5 = add nsw i32 %mul4, 9
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 2
+ store i32 %add5, i32* %arrayidx6, align 4
+ %mul7 = mul nsw i32 %n, 10
+ %add8 = add nsw i32 %mul7, 9
+ %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 3
+ store i32 %add8, i32* %arrayidx9, align 4
+ ret i32 undef
+}
+
+ ; We can still vectorize the stores to %A in @foo above, despite the call to @bar().
+
+declare i32 @bar(...)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll
new file mode 100644
index 00000000000..749e93b0413
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll
@@ -0,0 +1,426 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [4 x i64] zeroinitializer, align 32
+@dst64 = common global [4 x i64] zeroinitializer, align 32
+@src32 = common global [8 x i32] zeroinitializer, align 32
+@dst32 = common global [8 x i32] zeroinitializer, align 32
+@src16 = common global [16 x i16] zeroinitializer, align 32
+@dst16 = common global [16 x i16] zeroinitializer, align 32
+@src8 = common global [32 x i8] zeroinitializer, align 32
+@dst8 = common global [32 x i8] zeroinitializer, align 32
+
+declare i64 @llvm.bitreverse.i64(i64)
+declare i32 @llvm.bitreverse.i32(i32)
+declare i16 @llvm.bitreverse.i16(i16)
+declare i8 @llvm.bitreverse.i8(i8)
+
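+; Adjacent scalar @llvm.bitreverse.* calls on consecutive loads should be
+; combined into a single vector bitreverse intrinsic; the vector width (128-bit
+; vs 256-bit) depends on the target's register size.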
+define void @bitreverse_2i64() #0 {
+; CHECK-LABEL: @bitreverse_2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
+; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+ %bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
+ %bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
+ store i64 %bitreverse0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %bitreverse1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @bitreverse_4i64() #0 {
+; SSE-LABEL: @bitreverse_4i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP2]])
+; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
+; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @bitreverse_4i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
+; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX-NEXT: ret void
+;
+; XOP-LABEL: @bitreverse_4i64(
+; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; XOP-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
+; XOP-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; XOP-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+ %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+ %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+ %bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
+ %bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
+ %bitreverse2 = call i64 @llvm.bitreverse.i64(i64 %ld2)
+ %bitreverse3 = call i64 @llvm.bitreverse.i64(i64 %ld3)
+ store i64 %bitreverse0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+ store i64 %bitreverse1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+ store i64 %bitreverse2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+ store i64 %bitreverse3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+ ret void
+}
+
+define void @bitreverse_4i32() #0 {
+; CHECK-LABEL: @bitreverse_4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+ %bitreverse0 = call i32 @llvm.bitreverse.i32(i32 %ld0)
+ %bitreverse1 = call i32 @llvm.bitreverse.i32(i32 %ld1)
+ %bitreverse2 = call i32 @llvm.bitreverse.i32(i32 %ld2)
+ %bitreverse3 = call i32 @llvm.bitreverse.i32(i32 %ld3)
+ store i32 %bitreverse0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %bitreverse1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %bitreverse2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %bitreverse3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @bitreverse_8i32() #0 {
+; SSE-LABEL: @bitreverse_8i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP2]])
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @bitreverse_8i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX-NEXT: ret void
+;
+; XOP-LABEL: @bitreverse_8i32(
+; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; XOP-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
+; XOP-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; XOP-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+ %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+ %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+ %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+ %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+ %bitreverse0 = call i32 @llvm.bitreverse.i32(i32 %ld0)
+ %bitreverse1 = call i32 @llvm.bitreverse.i32(i32 %ld1)
+ %bitreverse2 = call i32 @llvm.bitreverse.i32(i32 %ld2)
+ %bitreverse3 = call i32 @llvm.bitreverse.i32(i32 %ld3)
+ %bitreverse4 = call i32 @llvm.bitreverse.i32(i32 %ld4)
+ %bitreverse5 = call i32 @llvm.bitreverse.i32(i32 %ld5)
+ %bitreverse6 = call i32 @llvm.bitreverse.i32(i32 %ld6)
+ %bitreverse7 = call i32 @llvm.bitreverse.i32(i32 %ld7)
+ store i32 %bitreverse0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+ store i32 %bitreverse1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+ store i32 %bitreverse2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+ store i32 %bitreverse3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+ store i32 %bitreverse4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+ store i32 %bitreverse5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+ store i32 %bitreverse6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+ store i32 %bitreverse7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+ ret void
+}
+
+define void @bitreverse_8i16() #0 {
+; CHECK-LABEL: @bitreverse_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %bitreverse0 = call i16 @llvm.bitreverse.i16(i16 %ld0)
+ %bitreverse1 = call i16 @llvm.bitreverse.i16(i16 %ld1)
+ %bitreverse2 = call i16 @llvm.bitreverse.i16(i16 %ld2)
+ %bitreverse3 = call i16 @llvm.bitreverse.i16(i16 %ld3)
+ %bitreverse4 = call i16 @llvm.bitreverse.i16(i16 %ld4)
+ %bitreverse5 = call i16 @llvm.bitreverse.i16(i16 %ld5)
+ %bitreverse6 = call i16 @llvm.bitreverse.i16(i16 %ld6)
+ %bitreverse7 = call i16 @llvm.bitreverse.i16(i16 %ld7)
+ store i16 %bitreverse0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %bitreverse1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %bitreverse2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %bitreverse3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %bitreverse4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %bitreverse5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %bitreverse6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %bitreverse7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ ret void
+}
+
+define void @bitreverse_16i16() #0 {
+; SSE-LABEL: @bitreverse_16i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP2]])
+; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @bitreverse_16i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
+; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; XOP-LABEL: @bitreverse_16i16(
+; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
+; XOP-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
+; XOP-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
+ %ld9 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
+ %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
+ %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
+ %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
+ %bitreverse0 = call i16 @llvm.bitreverse.i16(i16 %ld0)
+ %bitreverse1 = call i16 @llvm.bitreverse.i16(i16 %ld1)
+ %bitreverse2 = call i16 @llvm.bitreverse.i16(i16 %ld2)
+ %bitreverse3 = call i16 @llvm.bitreverse.i16(i16 %ld3)
+ %bitreverse4 = call i16 @llvm.bitreverse.i16(i16 %ld4)
+ %bitreverse5 = call i16 @llvm.bitreverse.i16(i16 %ld5)
+ %bitreverse6 = call i16 @llvm.bitreverse.i16(i16 %ld6)
+ %bitreverse7 = call i16 @llvm.bitreverse.i16(i16 %ld7)
+ %bitreverse8 = call i16 @llvm.bitreverse.i16(i16 %ld8)
+ %bitreverse9 = call i16 @llvm.bitreverse.i16(i16 %ld9)
+ %bitreverse10 = call i16 @llvm.bitreverse.i16(i16 %ld10)
+ %bitreverse11 = call i16 @llvm.bitreverse.i16(i16 %ld11)
+ %bitreverse12 = call i16 @llvm.bitreverse.i16(i16 %ld12)
+ %bitreverse13 = call i16 @llvm.bitreverse.i16(i16 %ld13)
+ %bitreverse14 = call i16 @llvm.bitreverse.i16(i16 %ld14)
+ %bitreverse15 = call i16 @llvm.bitreverse.i16(i16 %ld15)
+ store i16 %bitreverse0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %bitreverse1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %bitreverse2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %bitreverse3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %bitreverse4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %bitreverse5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %bitreverse6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %bitreverse7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ store i16 %bitreverse8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
+ store i16 %bitreverse9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
+ store i16 %bitreverse10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
+ store i16 %bitreverse11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
+ store i16 %bitreverse12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
+ store i16 %bitreverse13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
+ store i16 %bitreverse14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
+ store i16 %bitreverse15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
+ ret void
+}
+
+define void @bitreverse_16i8() #0 {
+; CHECK-LABEL: @bitreverse_16i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
+; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %bitreverse0 = call i8 @llvm.bitreverse.i8(i8 %ld0)
+ %bitreverse1 = call i8 @llvm.bitreverse.i8(i8 %ld1)
+ %bitreverse2 = call i8 @llvm.bitreverse.i8(i8 %ld2)
+ %bitreverse3 = call i8 @llvm.bitreverse.i8(i8 %ld3)
+ %bitreverse4 = call i8 @llvm.bitreverse.i8(i8 %ld4)
+ %bitreverse5 = call i8 @llvm.bitreverse.i8(i8 %ld5)
+ %bitreverse6 = call i8 @llvm.bitreverse.i8(i8 %ld6)
+ %bitreverse7 = call i8 @llvm.bitreverse.i8(i8 %ld7)
+ %bitreverse8 = call i8 @llvm.bitreverse.i8(i8 %ld8)
+ %bitreverse9 = call i8 @llvm.bitreverse.i8(i8 %ld9)
+ %bitreverse10 = call i8 @llvm.bitreverse.i8(i8 %ld10)
+ %bitreverse11 = call i8 @llvm.bitreverse.i8(i8 %ld11)
+ %bitreverse12 = call i8 @llvm.bitreverse.i8(i8 %ld12)
+ %bitreverse13 = call i8 @llvm.bitreverse.i8(i8 %ld13)
+ %bitreverse14 = call i8 @llvm.bitreverse.i8(i8 %ld14)
+ %bitreverse15 = call i8 @llvm.bitreverse.i8(i8 %ld15)
+ store i8 %bitreverse0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %bitreverse1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %bitreverse2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %bitreverse3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %bitreverse4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %bitreverse5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %bitreverse6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %bitreverse7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %bitreverse8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %bitreverse9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %bitreverse10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %bitreverse11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %bitreverse12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %bitreverse13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %bitreverse14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %bitreverse15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ ret void
+}
+
+define void @bitreverse_32i8() #0 {
+; CHECK-LABEL: @bitreverse_32i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]])
+; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ld16 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
+ %ld17 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
+ %ld18 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
+ %ld19 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
+ %ld20 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
+ %ld21 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
+ %ld22 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
+ %ld23 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
+ %ld24 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
+ %ld25 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
+ %ld26 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
+ %ld27 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
+ %ld28 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
+ %ld29 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
+ %ld30 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
+ %ld31 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
+ %bitreverse0 = call i8 @llvm.bitreverse.i8(i8 %ld0)
+ %bitreverse1 = call i8 @llvm.bitreverse.i8(i8 %ld1)
+ %bitreverse2 = call i8 @llvm.bitreverse.i8(i8 %ld2)
+ %bitreverse3 = call i8 @llvm.bitreverse.i8(i8 %ld3)
+ %bitreverse4 = call i8 @llvm.bitreverse.i8(i8 %ld4)
+ %bitreverse5 = call i8 @llvm.bitreverse.i8(i8 %ld5)
+ %bitreverse6 = call i8 @llvm.bitreverse.i8(i8 %ld6)
+ %bitreverse7 = call i8 @llvm.bitreverse.i8(i8 %ld7)
+ %bitreverse8 = call i8 @llvm.bitreverse.i8(i8 %ld8)
+ %bitreverse9 = call i8 @llvm.bitreverse.i8(i8 %ld9)
+ %bitreverse10 = call i8 @llvm.bitreverse.i8(i8 %ld10)
+ %bitreverse11 = call i8 @llvm.bitreverse.i8(i8 %ld11)
+ %bitreverse12 = call i8 @llvm.bitreverse.i8(i8 %ld12)
+ %bitreverse13 = call i8 @llvm.bitreverse.i8(i8 %ld13)
+ %bitreverse14 = call i8 @llvm.bitreverse.i8(i8 %ld14)
+ %bitreverse15 = call i8 @llvm.bitreverse.i8(i8 %ld15)
+ %bitreverse16 = call i8 @llvm.bitreverse.i8(i8 %ld16)
+ %bitreverse17 = call i8 @llvm.bitreverse.i8(i8 %ld17)
+ %bitreverse18 = call i8 @llvm.bitreverse.i8(i8 %ld18)
+ %bitreverse19 = call i8 @llvm.bitreverse.i8(i8 %ld19)
+ %bitreverse20 = call i8 @llvm.bitreverse.i8(i8 %ld20)
+ %bitreverse21 = call i8 @llvm.bitreverse.i8(i8 %ld21)
+ %bitreverse22 = call i8 @llvm.bitreverse.i8(i8 %ld22)
+ %bitreverse23 = call i8 @llvm.bitreverse.i8(i8 %ld23)
+ %bitreverse24 = call i8 @llvm.bitreverse.i8(i8 %ld24)
+ %bitreverse25 = call i8 @llvm.bitreverse.i8(i8 %ld25)
+ %bitreverse26 = call i8 @llvm.bitreverse.i8(i8 %ld26)
+ %bitreverse27 = call i8 @llvm.bitreverse.i8(i8 %ld27)
+ %bitreverse28 = call i8 @llvm.bitreverse.i8(i8 %ld28)
+ %bitreverse29 = call i8 @llvm.bitreverse.i8(i8 %ld29)
+ %bitreverse30 = call i8 @llvm.bitreverse.i8(i8 %ld30)
+ %bitreverse31 = call i8 @llvm.bitreverse.i8(i8 %ld31)
+ store i8 %bitreverse0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %bitreverse1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %bitreverse2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %bitreverse3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %bitreverse4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %bitreverse5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %bitreverse6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %bitreverse7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %bitreverse8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %bitreverse9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %bitreverse10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %bitreverse11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %bitreverse12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %bitreverse13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %bitreverse14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %bitreverse15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ store i8 %bitreverse16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
+ store i8 %bitreverse17, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
+ store i8 %bitreverse18, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
+ store i8 %bitreverse19, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
+ store i8 %bitreverse20, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
+ store i8 %bitreverse21, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
+ store i8 %bitreverse22, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
+ store i8 %bitreverse23, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
+ store i8 %bitreverse24, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
+ store i8 %bitreverse25, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
+ store i8 %bitreverse26, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
+ store i8 %bitreverse27, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
+ store i8 %bitreverse28, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
+ store i8 %bitreverse29, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
+ store i8 %bitreverse30, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
+ store i8 %bitreverse31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
+ ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
new file mode 100644
index 00000000000..9189fb4d18e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -instcombine | FileCheck %s
+
+define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i8> undef, i8 [[X0X0]], i32 0
+; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1
+; CHECK-NEXT: ret <2 x i8> [[INS2]]
+;
+ %x0 = extractelement <2 x i8> %x, i32 0
+ %y1 = extractelement <2 x i8> %y, i32 1
+ %x0x0 = mul i8 %x0, %x0
+ %y1y1 = mul i8 %y1, %y1
+ %ins1 = insertelement <2 x i8> undef, i8 %x0x0, i32 0
+ %ins2 = insertelement <2 x i8> %ins1, i8 %y1y1, i32 1
+ ret <2 x i8> %ins2
+}
+
+define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @h(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: ret <4 x i8> [[TMP2]]
+;
+ %x0 = extractelement <4 x i8> %x, i32 0
+ %x3 = extractelement <4 x i8> %x, i32 3
+ %y1 = extractelement <4 x i8> %y, i32 1
+ %y2 = extractelement <4 x i8> %y, i32 2
+ %x0x0 = mul i8 %x0, %x0
+ %x3x3 = mul i8 %x3, %x3
+ %y1y1 = mul i8 %y1, %y1
+ %y2y2 = mul i8 %y2, %y2
+ %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+ %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+ %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+ %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+ ret <4 x i8> %ins4
+}
+
+define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @h_undef(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 undef, i32 3, i32 5, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: ret <4 x i8> [[TMP2]]
+;
+ %x0 = extractelement <4 x i8> undef, i32 0
+ %x3 = extractelement <4 x i8> %x, i32 3
+ %y1 = extractelement <4 x i8> %y, i32 1
+ %y2 = extractelement <4 x i8> %y, i32 2
+ %x0x0 = mul i8 %x0, %x0
+ %x3x3 = mul i8 %x3, %x3
+ %y1y1 = mul i8 %y1, %y1
+ %y2y2 = mul i8 %y2, %y2
+ %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+ %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+ %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+ %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+ ret <4 x i8> %ins4
+}
+
+define i8 @i(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @i(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 5, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i8> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i8> [[BIN_RDX]], <4 x i8> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i8> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret i8 [[TMP3]]
+;
+ %x0 = extractelement <4 x i8> %x, i32 0
+ %x3 = extractelement <4 x i8> %x, i32 3
+ %y1 = extractelement <4 x i8> %y, i32 1
+ %y2 = extractelement <4 x i8> %y, i32 2
+ %x0x0 = mul i8 %x0, %x0
+ %x3x3 = mul i8 %x3, %x3
+ %y1y1 = mul i8 %y1, %y1
+ %y2y2 = mul i8 %y2, %y2
+ %1 = add i8 %x0x0, %x3x3
+ %2 = add i8 %y1y1, %y2y2
+ %3 = add i8 %1, %2
+ ret i8 %3
+}
+
+define i8 @j(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @j(
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
+;
+ %x0 = extractelement <4 x i8> %x, i32 0
+ %x3 = extractelement <4 x i8> %x, i32 3
+ %y1 = extractelement <4 x i8> %y, i32 1
+ %y2 = extractelement <4 x i8> %y, i32 2
+ %x0x0 = mul i8 %x0, %x0
+ %x3x3 = mul i8 %x3, %x3
+ %y1y1 = mul i8 %y1, %y1
+ %y2y2 = mul i8 %y2, %y2
+ %1 = add i8 %x0x0, %x3x3
+ %2 = add i8 %y1y1, %y2y2
+ %3 = sdiv i8 %1, %2
+ ret i8 %3
+}
+
+define i8 @k(<4 x i8> %x) {
+; CHECK-LABEL: @k(
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
+;
+ %x0 = extractelement <4 x i8> %x, i32 0
+ %x3 = extractelement <4 x i8> %x, i32 3
+ %x1 = extractelement <4 x i8> %x, i32 1
+ %x2 = extractelement <4 x i8> %x, i32 2
+ %x0x0 = mul i8 %x0, %x0
+ %x3x3 = mul i8 %x3, %x3
+ %x1x1 = mul i8 %x1, %x1
+ %x2x2 = mul i8 %x2, %x2
+ %1 = add i8 %x0x0, %x3x3
+ %2 = add i8 %x1x1, %x2x2
+ %3 = sdiv i8 %1, %2
+ ret i8 %3
+}
+
+define i8 @k_bb(<4 x i8> %x) {
+; CHECK-LABEL: @k_bb(
+; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret i8 [[TMP3]]
+;
+ %x0 = extractelement <4 x i8> %x, i32 0
+ br label %bb1
+bb1:
+ %x3 = extractelement <4 x i8> %x, i32 3
+ %x1 = extractelement <4 x i8> %x, i32 1
+ %x2 = extractelement <4 x i8> %x, i32 2
+ %x0x0 = mul i8 %x0, %x0
+ %x3x3 = mul i8 %x3, %x3
+ %x1x1 = mul i8 %x1, %x1
+ %x2x2 = mul i8 %x2, %x2
+ %1 = add i8 %x0x0, %x3x3
+ %2 = add i8 %x1x1, %x2x2
+ %3 = sdiv i8 %1, %2
+ ret i8 %3
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll b/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll
new file mode 100644
index 00000000000..79ce24f5911
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [4 x i64] zeroinitializer, align 32
+@dst64 = common global [4 x i64] zeroinitializer, align 32
+@src32 = common global [8 x i32] zeroinitializer, align 32
+@dst32 = common global [8 x i32] zeroinitializer, align 32
+@src16 = common global [16 x i16] zeroinitializer, align 32
+@dst16 = common global [16 x i16] zeroinitializer, align 32
+
+declare i64 @llvm.bswap.i64(i64)
+declare i32 @llvm.bswap.i32(i32)
+declare i16 @llvm.bswap.i16(i16)
+
+define void @bswap_2i64() #0 {
+; SSE-LABEL: @bswap_2i64(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]])
+; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]])
+; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @bswap_2i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP1]])
+; AVX-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+ %bswap0 = call i64 @llvm.bswap.i64(i64 %ld0)
+ %bswap1 = call i64 @llvm.bswap.i64(i64 %ld1)
+ store i64 %bswap0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %bswap1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @bswap_4i64() #0 {
+; SSE-LABEL: @bswap_4i64(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; SSE-NEXT: [[BSWAP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD0]])
+; SSE-NEXT: [[BSWAP1:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD1]])
+; SSE-NEXT: [[BSWAP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD2]])
+; SSE-NEXT: [[BSWAP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[LD3]])
+; SSE-NEXT: store i64 [[BSWAP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; SSE-NEXT: store i64 [[BSWAP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; SSE-NEXT: store i64 [[BSWAP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; SSE-NEXT: store i64 [[BSWAP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @bswap_4i64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> [[TMP1]])
+; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+ %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+ %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+ %bswap0 = call i64 @llvm.bswap.i64(i64 %ld0)
+ %bswap1 = call i64 @llvm.bswap.i64(i64 %ld1)
+ %bswap2 = call i64 @llvm.bswap.i64(i64 %ld2)
+ %bswap3 = call i64 @llvm.bswap.i64(i64 %ld3)
+ store i64 %bswap0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+ store i64 %bswap1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+ store i64 %bswap2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+ store i64 %bswap3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+ ret void
+}
+
+define void @bswap_4i32() #0 {
+; CHECK-LABEL: @bswap_4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+ %bswap0 = call i32 @llvm.bswap.i32(i32 %ld0)
+ %bswap1 = call i32 @llvm.bswap.i32(i32 %ld1)
+ %bswap2 = call i32 @llvm.bswap.i32(i32 %ld2)
+ %bswap3 = call i32 @llvm.bswap.i32(i32 %ld3)
+ store i32 %bswap0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %bswap1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %bswap2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %bswap3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @bswap_8i32() #0 {
+; SSE-LABEL: @bswap_8i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP2]])
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @bswap_8i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+ %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+ %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+ %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+ %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+ %bswap0 = call i32 @llvm.bswap.i32(i32 %ld0)
+ %bswap1 = call i32 @llvm.bswap.i32(i32 %ld1)
+ %bswap2 = call i32 @llvm.bswap.i32(i32 %ld2)
+ %bswap3 = call i32 @llvm.bswap.i32(i32 %ld3)
+ %bswap4 = call i32 @llvm.bswap.i32(i32 %ld4)
+ %bswap5 = call i32 @llvm.bswap.i32(i32 %ld5)
+ %bswap6 = call i32 @llvm.bswap.i32(i32 %ld6)
+ %bswap7 = call i32 @llvm.bswap.i32(i32 %ld7)
+ store i32 %bswap0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+ store i32 %bswap1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+ store i32 %bswap2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+ store i32 %bswap3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+ store i32 %bswap4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+ store i32 %bswap5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+ store i32 %bswap6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+ store i32 %bswap7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+ ret void
+}
+
+define void @bswap_8i16() #0 {
+; CHECK-LABEL: @bswap_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]])
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %bswap0 = call i16 @llvm.bswap.i16(i16 %ld0)
+ %bswap1 = call i16 @llvm.bswap.i16(i16 %ld1)
+ %bswap2 = call i16 @llvm.bswap.i16(i16 %ld2)
+ %bswap3 = call i16 @llvm.bswap.i16(i16 %ld3)
+ %bswap4 = call i16 @llvm.bswap.i16(i16 %ld4)
+ %bswap5 = call i16 @llvm.bswap.i16(i16 %ld5)
+ %bswap6 = call i16 @llvm.bswap.i16(i16 %ld6)
+ %bswap7 = call i16 @llvm.bswap.i16(i16 %ld7)
+ store i16 %bswap0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %bswap1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %bswap2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %bswap3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %bswap4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %bswap5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %bswap6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %bswap7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ ret void
+}
+
+define void @bswap_16i16() #0 {
+; SSE-LABEL: @bswap_16i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP2]])
+; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @bswap_16i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> [[TMP1]])
+; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
+ %ld9 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
+ %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
+ %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
+ %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
+ %bswap0 = call i16 @llvm.bswap.i16(i16 %ld0)
+ %bswap1 = call i16 @llvm.bswap.i16(i16 %ld1)
+ %bswap2 = call i16 @llvm.bswap.i16(i16 %ld2)
+ %bswap3 = call i16 @llvm.bswap.i16(i16 %ld3)
+ %bswap4 = call i16 @llvm.bswap.i16(i16 %ld4)
+ %bswap5 = call i16 @llvm.bswap.i16(i16 %ld5)
+ %bswap6 = call i16 @llvm.bswap.i16(i16 %ld6)
+ %bswap7 = call i16 @llvm.bswap.i16(i16 %ld7)
+ %bswap8 = call i16 @llvm.bswap.i16(i16 %ld8)
+ %bswap9 = call i16 @llvm.bswap.i16(i16 %ld9)
+ %bswap10 = call i16 @llvm.bswap.i16(i16 %ld10)
+ %bswap11 = call i16 @llvm.bswap.i16(i16 %ld11)
+ %bswap12 = call i16 @llvm.bswap.i16(i16 %ld12)
+ %bswap13 = call i16 @llvm.bswap.i16(i16 %ld13)
+ %bswap14 = call i16 @llvm.bswap.i16(i16 %ld14)
+ %bswap15 = call i16 @llvm.bswap.i16(i16 %ld15)
+ store i16 %bswap0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %bswap1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %bswap2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %bswap3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %bswap4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %bswap5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %bswap6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %bswap7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ store i16 %bswap8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
+ store i16 %bswap9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
+ store i16 %bswap10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
+ store i16 %bswap11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
+ store i16 %bswap12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
+ store i16 %bswap13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
+ store i16 %bswap14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
+ store i16 %bswap15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/call.ll b/llvm/test/Transforms/SLPVectorizer/X86/call.ll
new file mode 100644
index 00000000000..c93397c0337
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/call.ll
@@ -0,0 +1,177 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare double @sin(double)
+declare double @cos(double)
+declare double @pow(double, double)
+declare double @exp2(double)
+declare double @sqrt(double)
+declare i64 @round(i64)
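+; Note: libm's round has the prototype double round(double); the i64
+; signature above appears deliberate, keeping @round_custom below from being
+; treated as the library function or mapped to an intrinsic.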
+
+
+define void @sin_libm(double* %a, double* %b) {
+; CHECK-LABEL: @sin_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sin1 = tail call double @sin(double %a0) nounwind readnone
+ %sin2 = tail call double @sin(double %a1) nounwind readnone
+ store double %sin1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sin2, double* %idx2, align 8
+ ret void
+}
+
+define void @cos_libm(double* %a, double* %b) {
+; CHECK-LABEL: @cos_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.cos.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %cos1 = tail call double @cos(double %a0) nounwind readnone
+ %cos2 = tail call double @cos(double %a1) nounwind readnone
+ store double %cos1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %cos2, double* %idx2, align 8
+ ret void
+}
+
+define void @pow_libm(double* %a, double* %b) {
+; CHECK-LABEL: @pow_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %pow1 = tail call double @pow(double %a0, double %a0) nounwind readnone
+ %pow2 = tail call double @pow(double %a1, double %a1) nounwind readnone
+ store double %pow1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %pow2, double* %idx2, align 8
+ ret void
+}
+
+define void @exp_libm(double* %a, double* %b) {
+; CHECK-LABEL: @exp_libm(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.exp2.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %exp1 = tail call double @exp2(double %a0) nounwind readnone
+ %exp2 = tail call double @exp2(double %a1) nounwind readnone
+ store double %exp1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %exp2, double* %idx2, align 8
+ ret void
+}
+
+; No fast-math-flags are required to convert sqrt library calls to an intrinsic.
+; We just need to know that errno is not set (readnone).
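+; (It is the readnone attribute on the scalar calls below that licenses the
+; rewrite to the @llvm.sqrt.v2f64 intrinsic shown in the CHECK lines.)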
+
+define void @sqrt_libm_no_errno(double* %a, double* %b) {
+; CHECK-LABEL: @sqrt_libm_no_errno(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sqrt1 = tail call double @sqrt(double %a0) nounwind readnone
+ %sqrt2 = tail call double @sqrt(double %a1) nounwind readnone
+ store double %sqrt1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sqrt2, double* %idx2, align 8
+ ret void
+}
+
+; The sqrt intrinsic does not set errno, but a non-constant sqrt call might, so this can't vectorize.
+; The nnan on the call does not matter because there's no guarantee in the C standard that a negative
+; input would result in a nan output ("On a domain error, the function returns an
+; implementation-defined value.")
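+;
+; A minimal C sketch (hypothetical, for illustration) of the errno behavior
+; a vector rewrite would lose:
+;
+;   errno = 0;
+;   b[0] = sqrt(a[0]);  // may set errno (e.g. EDOM for a negative input)
+;   b[1] = sqrt(a[1]);  // may set errno again
+;   if (errno) { ... }  // @llvm.sqrt.v2f64 never writes errno, so merging
+;                       // the two calls would leave this check unreliable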
+
+define void @sqrt_libm_errno(double* %a, double* %b) {
+; CHECK-LABEL: @sqrt_libm_errno(
+; CHECK-NEXT: [[A0:%.*]] = load double, double* [[A:%.*]], align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDX1]], align 8
+; CHECK-NEXT: [[SQRT1:%.*]] = tail call nnan double @sqrt(double [[A0]]) #2
+; CHECK-NEXT: [[SQRT2:%.*]] = tail call nnan double @sqrt(double [[A1]]) #2
+; CHECK-NEXT: store double [[SQRT1]], double* [[B:%.*]], align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: store double [[SQRT2]], double* [[IDX2]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* %a, align 8
+ %idx1 = getelementptr inbounds double, double* %a, i64 1
+ %a1 = load double, double* %idx1, align 8
+ %sqrt1 = tail call nnan double @sqrt(double %a0) nounwind
+ %sqrt2 = tail call nnan double @sqrt(double %a1) nounwind
+ store double %sqrt1, double* %b, align 8
+ %idx2 = getelementptr inbounds double, double* %b, i64 1
+ store double %sqrt2, double* %idx2, align 8
+ ret void
+}
+
+; Negative test case: @round here does not match any libm prototype, so the
+; calls below must stay scalar.
+define void @round_custom(i64* %a, i64* %b) {
+; CHECK-LABEL: @round_custom(
+; CHECK-NEXT: [[A0:%.*]] = load i64, i64* [[A:%.*]], align 8
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1
+; CHECK-NEXT: [[A1:%.*]] = load i64, i64* [[IDX1]], align 8
+; CHECK-NEXT: [[ROUND1:%.*]] = tail call i64 @round(i64 [[A0]]) #3
+; CHECK-NEXT: [[ROUND2:%.*]] = tail call i64 @round(i64 [[A1]]) #3
+; CHECK-NEXT: store i64 [[ROUND1]], i64* [[B:%.*]], align 8
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 1
+; CHECK-NEXT: store i64 [[ROUND2]], i64* [[IDX2]], align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load i64, i64* %a, align 8
+ %idx1 = getelementptr inbounds i64, i64* %a, i64 1
+ %a1 = load i64, i64* %idx1, align 8
+ %round1 = tail call i64 @round(i64 %a0) nounwind readnone
+ %round2 = tail call i64 @round(i64 %a1) nounwind readnone
+ store i64 %round1, i64* %b, align 8
+ %idx2 = getelementptr inbounds i64, i64* %b, i64 1
+ store i64 %round2, i64* %idx2, align 8
+ ret void
+}
+
+
+; CHECK: declare <2 x double> @llvm.sin.v2f64(<2 x double>) [[ATTR0:#[0-9]+]]
+; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) [[ATTR0]]
+; CHECK: declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) [[ATTR0]]
+; CHECK: declare <2 x double> @llvm.exp2.v2f64(<2 x double>) [[ATTR0]]
+
+; CHECK: attributes [[ATTR0]] = { nounwind readnone speculatable }
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/cast.ll
new file mode 100644
index 00000000000..2f9f84948ea
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cast.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; int test_sext_4i8_to_4i32(int * restrict A, char * restrict B) {
+; A[0] = B[0];
+; A[1] = B[1];
+; A[2] = B[2];
+; A[3] = B[3];
+; }
+
+define i32 @test_sext_4i8_to_4i32(i32* noalias nocapture %A, i8* noalias nocapture %B) {
+; CHECK-LABEL: @test_sext_4i8_to_4i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <4 x i8>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load i8, i8* %B, align 1
+ %conv = sext i8 %0 to i32
+ store i32 %conv, i32* %A, align 4
+ %arrayidx2 = getelementptr inbounds i8, i8* %B, i64 1
+ %1 = load i8, i8* %arrayidx2, align 1
+ %conv3 = sext i8 %1 to i32
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 1
+ store i32 %conv3, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i8, i8* %B, i64 2
+ %2 = load i8, i8* %arrayidx5, align 1
+ %conv6 = sext i8 %2 to i32
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 2
+ store i32 %conv6, i32* %arrayidx7, align 4
+ %arrayidx8 = getelementptr inbounds i8, i8* %B, i64 3
+ %3 = load i8, i8* %arrayidx8, align 1
+ %conv9 = sext i8 %3 to i32
+ %arrayidx10 = getelementptr inbounds i32, i32* %A, i64 3
+ store i32 %conv9, i32* %arrayidx10, align 4
+ ret i32 undef
+}
+
+define i32 @test_zext_4i16_to_4i32(i32* noalias nocapture %A, i16* noalias nocapture %B) {
+; CHECK-LABEL: @test_zext_4i16_to_4i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load i16, i16* %B, align 1
+ %conv = zext i16 %0 to i32
+ store i32 %conv, i32* %A, align 4
+ %arrayidx2 = getelementptr inbounds i16, i16* %B, i64 1
+ %1 = load i16, i16* %arrayidx2, align 1
+ %conv3 = zext i16 %1 to i32
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 1
+ store i32 %conv3, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i16, i16* %B, i64 2
+ %2 = load i16, i16* %arrayidx5, align 1
+ %conv6 = zext i16 %2 to i32
+ %arrayidx7 = getelementptr inbounds i32, i32* %A, i64 2
+ store i32 %conv6, i32* %arrayidx7, align 4
+ %arrayidx8 = getelementptr inbounds i16, i16* %B, i64 3
+ %3 = load i16, i16* %arrayidx8, align 1
+ %conv9 = zext i16 %3 to i32
+ %arrayidx10 = getelementptr inbounds i32, i32* %A, i64 3
+ store i32 %conv9, i32* %arrayidx10, align 4
+ ret i32 undef
+}
+
+define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) {
+; CHECK-LABEL: @test_sext_4i16_to_4i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
+; CHECK-NEXT: ret i64 undef
+;
+entry:
+ %0 = load i16, i16* %B, align 1
+ %conv = sext i16 %0 to i64
+ store i64 %conv, i64* %A, align 4
+ %arrayidx2 = getelementptr inbounds i16, i16* %B, i64 1
+ %1 = load i16, i16* %arrayidx2, align 1
+ %conv3 = sext i16 %1 to i64
+ %arrayidx4 = getelementptr inbounds i64, i64* %A, i64 1
+ store i64 %conv3, i64* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i16, i16* %B, i64 2
+ %2 = load i16, i16* %arrayidx5, align 1
+ %conv6 = sext i16 %2 to i64
+ %arrayidx7 = getelementptr inbounds i64, i64* %A, i64 2
+ store i64 %conv6, i64* %arrayidx7, align 4
+ %arrayidx8 = getelementptr inbounds i16, i16* %B, i64 3
+ %3 = load i16, i16* %arrayidx8, align 1
+ %conv9 = sext i16 %3 to i64
+ %arrayidx10 = getelementptr inbounds i64, i64* %A, i64 3
+ store i64 %conv9, i64* %arrayidx10, align 4
+ ret i64 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
new file mode 100644
index 00000000000..03db2af9239
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll
@@ -0,0 +1,283 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64--- -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
+
+;
+; Check that we can commute operands based on the predicate.
+;
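+; For reference, an illustrative identity (assumed background, not a check):
+; a commutative predicate ignores operand order, e.g.
+;   icmp eq i32 %x, %y  ==  icmp eq i32 %y, %x
+; so lanes written with either operand order can still fold into a single
+; vector compare.
+;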
+
+define <4 x i32> @icmp_eq_v4i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: @icmp_eq_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %p0 = getelementptr inbounds i32, i32* %b, i32 0
+ %p1 = getelementptr inbounds i32, i32* %b, i32 1
+ %p2 = getelementptr inbounds i32, i32* %b, i32 2
+ %p3 = getelementptr inbounds i32, i32* %b, i32 3
+ %b0 = load i32, i32* %p0, align 4
+ %b1 = load i32, i32* %p1, align 4
+ %b2 = load i32, i32* %p2, align 4
+ %b3 = load i32, i32* %p3, align 4
+ %c0 = icmp eq i32 %a0, %b0
+ %c1 = icmp eq i32 %b1, %a1
+ %c2 = icmp eq i32 %b2, %a2
+ %c3 = icmp eq i32 %a3, %b3
+ %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
+ %d1 = insertelement <4 x i1> %d0, i1 %c1, i32 1
+ %d2 = insertelement <4 x i1> %d1, i1 %c2, i32 2
+ %d3 = insertelement <4 x i1> %d2, i1 %c3, i32 3
+ %r = sext <4 x i1> %d3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @icmp_ne_v4i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: @icmp_ne_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %p0 = getelementptr inbounds i32, i32* %b, i32 0
+ %p1 = getelementptr inbounds i32, i32* %b, i32 1
+ %p2 = getelementptr inbounds i32, i32* %b, i32 2
+ %p3 = getelementptr inbounds i32, i32* %b, i32 3
+ %b0 = load i32, i32* %p0, align 4
+ %b1 = load i32, i32* %p1, align 4
+ %b2 = load i32, i32* %p2, align 4
+ %b3 = load i32, i32* %p3, align 4
+ %c0 = icmp ne i32 %a0, %b0
+ %c1 = icmp ne i32 %b1, %a1
+ %c2 = icmp ne i32 %b2, %a2
+ %c3 = icmp ne i32 %a3, %b3
+ %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
+ %d1 = insertelement <4 x i1> %d0, i1 %c1, i32 1
+ %d2 = insertelement <4 x i1> %d1, i1 %c2, i32 2
+ %d3 = insertelement <4 x i1> %d2, i1 %c3, i32 3
+ %r = sext <4 x i1> %d3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @fcmp_oeq_v4i32(<4 x float> %a, float* %b) {
+; CHECK-LABEL: @fcmp_oeq_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %p0 = getelementptr inbounds float, float* %b, i32 0
+ %p1 = getelementptr inbounds float, float* %b, i32 1
+ %p2 = getelementptr inbounds float, float* %b, i32 2
+ %p3 = getelementptr inbounds float, float* %b, i32 3
+ %b0 = load float, float* %p0, align 4
+ %b1 = load float, float* %p1, align 4
+ %b2 = load float, float* %p2, align 4
+ %b3 = load float, float* %p3, align 4
+ %c0 = fcmp oeq float %a0, %b0
+ %c1 = fcmp oeq float %b1, %a1
+ %c2 = fcmp oeq float %b2, %a2
+ %c3 = fcmp oeq float %a3, %b3
+ %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
+ %d1 = insertelement <4 x i1> %d0, i1 %c1, i32 1
+ %d2 = insertelement <4 x i1> %d1, i1 %c2, i32 2
+ %d3 = insertelement <4 x i1> %d2, i1 %c3, i32 3
+ %r = sext <4 x i1> %d3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @fcmp_uno_v4i32(<4 x float> %a, float* %b) {
+; CHECK-LABEL: @fcmp_uno_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %p0 = getelementptr inbounds float, float* %b, i32 0
+ %p1 = getelementptr inbounds float, float* %b, i32 1
+ %p2 = getelementptr inbounds float, float* %b, i32 2
+ %p3 = getelementptr inbounds float, float* %b, i32 3
+ %b0 = load float, float* %p0, align 4
+ %b1 = load float, float* %p1, align 4
+ %b2 = load float, float* %p2, align 4
+ %b3 = load float, float* %p3, align 4
+ %c0 = fcmp uno float %a0, %b0
+ %c1 = fcmp uno float %b1, %a1
+ %c2 = fcmp uno float %b2, %a2
+ %c3 = fcmp uno float %a3, %b3
+ %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
+ %d1 = insertelement <4 x i1> %d0, i1 %c1, i32 1
+ %d2 = insertelement <4 x i1> %d1, i1 %c2, i32 2
+ %d3 = insertelement <4 x i1> %d2, i1 %c3, i32 3
+ %r = sext <4 x i1> %d3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+;
+; Check that we can commute operands by swapping the predicate.
+;
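+; For reference, an illustrative identity (assumed background, not a check):
+; swapping the operands of an ordering predicate flips the predicate, e.g.
+;   icmp sgt i32 %x, %y  ==  icmp slt i32 %y, %x
+; so mixed sgt/slt lanes can still fold into a single vector icmp.
+;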
+
+define <4 x i32> @icmp_sgt_slt_v4i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: @icmp_sgt_slt_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %p0 = getelementptr inbounds i32, i32* %b, i32 0
+ %p1 = getelementptr inbounds i32, i32* %b, i32 1
+ %p2 = getelementptr inbounds i32, i32* %b, i32 2
+ %p3 = getelementptr inbounds i32, i32* %b, i32 3
+ %b0 = load i32, i32* %p0, align 4
+ %b1 = load i32, i32* %p1, align 4
+ %b2 = load i32, i32* %p2, align 4
+ %b3 = load i32, i32* %p3, align 4
+ %c0 = icmp sgt i32 %a0, %b0
+ %c1 = icmp slt i32 %b1, %a1
+ %c2 = icmp slt i32 %b2, %a2
+ %c3 = icmp sgt i32 %a3, %b3
+ %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
+ %d1 = insertelement <4 x i1> %d0, i1 %c1, i32 1
+ %d2 = insertelement <4 x i1> %d1, i1 %c2, i32 2
+ %d3 = insertelement <4 x i1> %d2, i1 %c3, i32 3
+ %r = sext <4 x i1> %d3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @icmp_uge_ule_v4i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: @icmp_uge_ule_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %p0 = getelementptr inbounds i32, i32* %b, i32 0
+ %p1 = getelementptr inbounds i32, i32* %b, i32 1
+ %p2 = getelementptr inbounds i32, i32* %b, i32 2
+ %p3 = getelementptr inbounds i32, i32* %b, i32 3
+ %b0 = load i32, i32* %p0, align 4
+ %b1 = load i32, i32* %p1, align 4
+ %b2 = load i32, i32* %p2, align 4
+ %b3 = load i32, i32* %p3, align 4
+ %c0 = icmp uge i32 %a0, %b0
+ %c1 = icmp ule i32 %b1, %a1
+ %c2 = icmp ule i32 %b2, %a2
+ %c3 = icmp uge i32 %a3, %b3
+ %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
+ %d1 = insertelement <4 x i1> %d0, i1 %c1, i32 1
+ %d2 = insertelement <4 x i1> %d1, i1 %c2, i32 2
+ %d3 = insertelement <4 x i1> %d2, i1 %c3, i32 3
+ %r = sext <4 x i1> %d3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @fcmp_ogt_olt_v4i32(<4 x float> %a, float* %b) {
+; CHECK-LABEL: @fcmp_ogt_olt_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[TMP2]], [[A:%.*]]
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %p0 = getelementptr inbounds float, float* %b, i32 0
+ %p1 = getelementptr inbounds float, float* %b, i32 1
+ %p2 = getelementptr inbounds float, float* %b, i32 2
+ %p3 = getelementptr inbounds float, float* %b, i32 3
+ %b0 = load float, float* %p0, align 4
+ %b1 = load float, float* %p1, align 4
+ %b2 = load float, float* %p2, align 4
+ %b3 = load float, float* %p3, align 4
+ %c0 = fcmp ogt float %a0, %b0
+ %c1 = fcmp olt float %b1, %a1
+ %c2 = fcmp olt float %b2, %a2
+ %c3 = fcmp ogt float %a3, %b3
+ %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
+ %d1 = insertelement <4 x i1> %d0, i1 %c1, i32 1
+ %d2 = insertelement <4 x i1> %d1, i1 %c2, i32 2
+ %d3 = insertelement <4 x i1> %d2, i1 %c3, i32 3
+ %r = sext <4 x i1> %d3 to <4 x i32>
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) {
+; CHECK-LABEL: @fcmp_ord_uno_v4i32(
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4
+; CHECK-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4
+; CHECK-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4
+; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4
+; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]]
+; CHECK-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]]
+; CHECK-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]]
+; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]]
+; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0
+; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1
+; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2
+; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3
+; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32>
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %p0 = getelementptr inbounds float, float* %b, i32 0
+ %p1 = getelementptr inbounds float, float* %b, i32 1
+ %p2 = getelementptr inbounds float, float* %b, i32 2
+ %p3 = getelementptr inbounds float, float* %b, i32 3
+ %b0 = load float, float* %p0, align 4
+ %b1 = load float, float* %p1, align 4
+ %b2 = load float, float* %p2, align 4
+ %b3 = load float, float* %p3, align 4
+ %c0 = fcmp ord float %a0, %b0
+ %c1 = fcmp uno float %b1, %a1
+ %c2 = fcmp uno float %b2, %a2
+ %c3 = fcmp ord float %a3, %b3
+ %d0 = insertelement <4 x i1> undef, i1 %c0, i32 0
+ %d1 = insertelement <4 x i1> %d0, i1 %c1, i32 1
+ %d2 = insertelement <4 x i1> %d1, i1 %c2, i32 2
+ %d3 = insertelement <4 x i1> %d2, i1 %c3, i32 3
+ %r = sext <4 x i1> %d3 to <4 x i32>
+ ret <4 x i32> %r
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_sel.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_sel.ll
new file mode 100644
index 00000000000..331c8e06466
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_sel.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+; int foo(double * restrict A, double * restrict B, double G) {
+; A[0] = (B[10] ? G : 1);
+; A[1] = (B[11] ? G : 1);
+; }
+
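+; An illustrative shape of the expected transform (a sketch using assumed
+; value names, mirroring the checks below):
+;   %cmp = fcmp une <2 x double> %bvec, zeroinitializer
+;   %res = select <2 x i1> %cmp, <2 x double> %gsplat, <2 x double> <double 1.0, double 1.0>
+; where %bvec is the vectorized load of B[10..11] and %gsplat is %G broadcast
+; into both lanes.
+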
+define i32 @foo(double* noalias nocapture %A, double* noalias nocapture %B, double %G) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 10
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp une <2 x double> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[G:%.*]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[G]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP2]], <2 x double> [[TMP4]], <2 x double> <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %arrayidx = getelementptr inbounds double, double* %B, i64 10
+ %0 = load double, double* %arrayidx, align 8
+ %tobool = fcmp une double %0, 0.000000e+00
+ %cond = select i1 %tobool, double %G, double 1.000000e+00
+ store double %cond, double* %A, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %B, i64 11
+ %1 = load double, double* %arrayidx2, align 8
+ %tobool3 = fcmp une double %1, 0.000000e+00
+ %cond7 = select i1 %tobool3, double %G, double 1.000000e+00
+ %arrayidx8 = getelementptr inbounds double, double* %A, i64 1
+ store double %cond7, double* %arrayidx8, align 8
+ ret i32 undef
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
new file mode 100644
index 00000000000..ad566cb3411
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S | FileCheck %s
+
+; Verify that the SLP vectorizer is able to figure out that commutativity
+; offers the possibility to splat/broadcast %c and thus makes it profitable
+; to vectorize this case.
+
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+@cle = external unnamed_addr global [32 x i8], align 16
+@cle32 = external unnamed_addr global [32 x i32], align 16
+
+
+; Check that we correctly detect a splat/broadcast by leveraging the
+; commutativity property of `xor`.
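+;
+; An illustrative identity (not an extra check): since
+;   xor i8 %c, %a  ==  xor i8 %a, %c
+; every lane can be written with %c first, leaving a broadcast of %c on one
+; side and a cheap shuffle of %a/%b on the other.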
+
+define void @splat(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: @splat(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]]
+; CHECK-NEXT: store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
+; CHECK-NEXT: ret void
+;
+ %1 = xor i8 %c, %a
+ store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
+ %2 = xor i8 %a, %c
+ store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
+ %3 = xor i8 %a, %c
+ store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
+ %4 = xor i8 %a, %c
+ store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
+ %5 = xor i8 %c, %a
+ store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
+ %6 = xor i8 %c, %b
+ store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
+ %7 = xor i8 %c, %a
+ store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
+ %8 = xor i8 %c, %b
+ store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
+ %9 = xor i8 %a, %c
+ store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
+ %10 = xor i8 %a, %c
+ store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
+ %11 = xor i8 %a, %c
+ store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
+ %12 = xor i8 %a, %c
+ store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
+ %13 = xor i8 %a, %c
+ store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
+ %14 = xor i8 %a, %c
+ store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
+ %15 = xor i8 %a, %c
+ store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
+ %16 = xor i8 %a, %c
+ store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
+ ret void
+}
+
+
+
+; Check that we correctly detect that we can have the same opcode on one side by
+; leveraging the commutativity property of `xor`.
+
+define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @same_opcode_on_one_side(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
+; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
+; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
+; CHECK-NEXT: ret void
+;
+ %add1 = add i32 %c, %a
+ %add2 = add i32 %c, %a
+ %add3 = add i32 %a, %c
+ %add4 = add i32 %c, %a
+ %1 = xor i32 %add1, %a
+ store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
+ %2 = xor i32 %b, %add2
+ store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
+ %3 = xor i32 %c, %add3
+ store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
+ %4 = xor i32 %a, %add4
+ store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
new file mode 100644
index 00000000000..c16ac538559
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@.str = private unnamed_addr constant [6 x i8] c"bingo\00", align 1
+
+define void @reduce_compare(double* nocapture %A, i32 %n) {
+; CHECK-LABEL: @reduce_compare(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 7.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 5.000000e+00, double 9.000000e+00>
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]]
+; CHECK-NEXT: br i1 [[CMP11]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0))
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 100
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %conv = sitofp i32 %n to double
+ br label %for.body
+
+for.body: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds double, double* %A, i64 %0
+ %1 = load double, double* %arrayidx, align 8
+ %mul1 = fmul double %conv, %1
+ %mul2 = fmul double %mul1, 7.000000e+00
+ %add = fadd double %mul2, 5.000000e+00
+ %2 = or i64 %0, 1
+ %arrayidx6 = getelementptr inbounds double, double* %A, i64 %2
+ %3 = load double, double* %arrayidx6, align 8
+ %mul8 = fmul double %conv, %3
+ %mul9 = fmul double %mul8, 4.000000e+00
+ %add10 = fadd double %mul9, 9.000000e+00
+ %cmp11 = fcmp ogt double %add, %add10
+ br i1 %cmp11, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0))
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc
+ ret void
+}
+
+declare i32 @printf(i8* nocapture, ...)
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
new file mode 100644
index 00000000000..f394dc74397
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll
@@ -0,0 +1,556 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@A = common global [2000 x double] zeroinitializer, align 16
+@B = common global [2000 x double] zeroinitializer, align 16
+@C = common global [2000 x float] zeroinitializer, align 16
+@D = common global [2000 x float] zeroinitializer, align 16
+
+; Currently SCEV isn't smart enough to figure out that accesses
+; A[3*i], A[3*i+1] and A[3*i+2] are consecutive, but hopefully that
+; will be fixed in the future. For now, check that this isn't
+; vectorized.
+; Function Attrs: nounwind ssp uwtable
+define void @foo_3double(i32 %u) #0 {
+; CHECK-LABEL: @foo_3double(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[U]], 3
+; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[TMP0]], [[TMP1]]
+; CHECK-NEXT: store double [[ADD5]], double* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1
+; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX13]], align 8
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX17]], align 8
+; CHECK-NEXT: [[ADD18:%.*]] = fadd double [[TMP2]], [[TMP3]]
+; CHECK-NEXT: store double [[ADD18]], double* [[ARRAYIDX13]], align 8
+; CHECK-NEXT: [[ADD24:%.*]] = add nsw i32 [[MUL]], 2
+; CHECK-NEXT: [[IDXPROM25:%.*]] = sext i32 [[ADD24]] to i64
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM25]]
+; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX26]], align 8
+; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM25]]
+; CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX30]], align 8
+; CHECK-NEXT: [[ADD31:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; CHECK-NEXT: store double [[ADD31]], double* [[ARRAYIDX26]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %u.addr = alloca i32, align 4
+ store i32 %u, i32* %u.addr, align 4
+ %mul = mul nsw i32 %u, 3
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx4 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom
+ %1 = load double, double* %arrayidx4, align 8
+ %add5 = fadd double %0, %1
+ store double %add5, double* %arrayidx, align 8
+ %add11 = add nsw i32 %mul, 1
+ %idxprom12 = sext i32 %add11 to i64
+ %arrayidx13 = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom12
+ %2 = load double, double* %arrayidx13, align 8
+ %arrayidx17 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom12
+ %3 = load double, double* %arrayidx17, align 8
+ %add18 = fadd double %2, %3
+ store double %add18, double* %arrayidx13, align 8
+ %add24 = add nsw i32 %mul, 2
+ %idxprom25 = sext i32 %add24 to i64
+ %arrayidx26 = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom25
+ %4 = load double, double* %arrayidx26, align 8
+ %arrayidx30 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom25
+ %5 = load double, double* %arrayidx30, align 8
+ %add31 = fadd double %4, %5
+ store double %add31, double* %arrayidx26, align 8
+ ret void
+}
+
+; SCEV should be able to tell that accesses A[C1 + C2*i], A[C1 + 1 + C2*i],
+; ..., A[C1 + k + C2*i] are consecutive, if C2 is a power of 2 and
+; C2 > C1 + k >= 0. Thus, the following code should be vectorized.
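+;
+; A rough sketch of the reasoning for @foo_2double (an illustration, not
+; anything the pass prints): %mul = 2 * %u is always even, so it can never
+; equal INT32_MAX, and
+;   %add11 = add nsw i32 %mul, 1     ; even + 1 cannot signed-wrap
+; implies sext(%mul + 1) == sext(%mul) + 1, i.e. the two i64 indices are
+; adjacent and the GEPs address consecutive doubles.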
+; Function Attrs: nounwind ssp uwtable
+define void @foo_2double(i32 %u) #0 {
+; CHECK-LABEL: @foo_2double(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[U]], 2
+; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1
+; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %u.addr = alloca i32, align 4
+ store i32 %u, i32* %u.addr, align 4
+ %mul = mul nsw i32 %u, 2
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx4 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom
+ %1 = load double, double* %arrayidx4, align 8
+ %add5 = fadd double %0, %1
+ store double %add5, double* %arrayidx, align 8
+ %add11 = add nsw i32 %mul, 1
+ %idxprom12 = sext i32 %add11 to i64
+ %arrayidx13 = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom12
+ %2 = load double, double* %arrayidx13, align 8
+ %arrayidx17 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom12
+ %3 = load double, double* %arrayidx17, align 8
+ %add18 = fadd double %2, %3
+ store double %add18, double* %arrayidx13, align 8
+ ret void
+}
+
+; Similar to the previous test, but with different datatype.
+; Function Attrs: nounwind ssp uwtable
+define void @foo_4float(i32 %u) #0 {
+; CHECK-LABEL: @foo_4float(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[U]], 4
+; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1
+; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[ADD24:%.*]] = add nsw i32 [[MUL]], 2
+; CHECK-NEXT: [[IDXPROM25:%.*]] = sext i32 [[ADD24]] to i64
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM25]]
+; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM25]]
+; CHECK-NEXT: [[ADD37:%.*]] = add nsw i32 [[MUL]], 3
+; CHECK-NEXT: [[IDXPROM38:%.*]] = sext i32 [[ADD37]] to i64
+; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM38]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM38]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %u.addr = alloca i32, align 4
+ store i32 %u, i32* %u.addr, align 4
+ %mul = mul nsw i32 %u, 4
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 %idxprom
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 %idxprom
+ %1 = load float, float* %arrayidx4, align 4
+ %add5 = fadd float %0, %1
+ store float %add5, float* %arrayidx, align 4
+ %add11 = add nsw i32 %mul, 1
+ %idxprom12 = sext i32 %add11 to i64
+ %arrayidx13 = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 %idxprom12
+ %2 = load float, float* %arrayidx13, align 4
+ %arrayidx17 = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 %idxprom12
+ %3 = load float, float* %arrayidx17, align 4
+ %add18 = fadd float %2, %3
+ store float %add18, float* %arrayidx13, align 4
+ %add24 = add nsw i32 %mul, 2
+ %idxprom25 = sext i32 %add24 to i64
+ %arrayidx26 = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 %idxprom25
+ %4 = load float, float* %arrayidx26, align 4
+ %arrayidx30 = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 %idxprom25
+ %5 = load float, float* %arrayidx30, align 4
+ %add31 = fadd float %4, %5
+ store float %add31, float* %arrayidx26, align 4
+ %add37 = add nsw i32 %mul, 3
+ %idxprom38 = sext i32 %add37 to i64
+ %arrayidx39 = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 %idxprom38
+ %6 = load float, float* %arrayidx39, align 4
+ %arrayidx43 = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 %idxprom38
+ %7 = load float, float* %arrayidx43, align 4
+ %add44 = fadd float %6, %7
+ store float %add44, float* %arrayidx39, align 4
+ ret void
+}
+
+; Similar to the previous tests, but now we are dealing with AddRec SCEV.
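+; An illustrative view of the SCEV forms involved (assumed, not printed by
+; the test): the induction variable is {0,+,1}<%for.body>, so the indices are
+; the AddRecs
+;   %mul     -> {0,+,2}<%for.body>
+;   %mul + 1 -> {1,+,2}<%for.body>
+; which differ by the constant 1, making the two loads consecutive in every
+; iteration.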
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo_loop(double* %A, i32 %n) #0 {
+; CHECK-LABEL: @foo_loop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
+; CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[SUM:%.*]] = alloca double, align 8
+; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store double* [[A:%.*]], double** [[A_ADDR]], align 8
+; CHECK-NEXT: store i32 [[N:%.*]], i32* [[N_ADDR]], align 4
+; CHECK-NEXT: store double 0.000000e+00, double* [[SUM]], align 8
+; CHECK-NEXT: store i32 0, i32* [[I]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[N]]
+; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 2
+; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], 1
+; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[ADD]] to i64
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM3]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> <double 7.000000e+00, double 7.000000e+00>, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[ADD6:%.*]] = fadd double [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[ADD7]] = fadd double [[TMP1]], [[ADD6]]
+; CHECK-NEXT: store double [[ADD7]], double* [[SUM]], align 8
+; CHECK-NEXT: [[INC]] = add nsw i32 [[TMP0]], 1
+; CHECK-NEXT: store i32 [[INC]], i32* [[I]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: [[SPLIT:%.*]] = phi double [ [[ADD7]], [[FOR_BODY]] ]
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi double [ [[SPLIT]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[CONV:%.*]] = fptosi double [[DOTLCSSA]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %A.addr = alloca double*, align 8
+ %n.addr = alloca i32, align 4
+ %sum = alloca double, align 8
+ %i = alloca i32, align 4
+ store double* %A, double** %A.addr, align 8
+ store i32 %n, i32* %n.addr, align 4
+ store double 0.000000e+00, double* %sum, align 8
+ store i32 0, i32* %i, align 4
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %0 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %1 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add7, %for.body ]
+ %mul = mul nsw i32 %0, 2
+ %idxprom = sext i32 %mul to i64
+ %arrayidx = getelementptr inbounds double, double* %A, i64 %idxprom
+ %2 = load double, double* %arrayidx, align 8
+ %mul1 = fmul double 7.000000e+00, %2
+ %add = add nsw i32 %mul, 1
+ %idxprom3 = sext i32 %add to i64
+ %arrayidx4 = getelementptr inbounds double, double* %A, i64 %idxprom3
+ %3 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double 7.000000e+00, %3
+ %add6 = fadd double %mul1, %mul5
+ %add7 = fadd double %1, %add6
+ store double %add7, double* %sum, align 8
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* %i, align 4
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %split = phi double [ %add7, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ %.lcssa = phi double [ %split, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry ]
+ %conv = fptosi double %.lcssa to i32
+ ret i32 %conv
+}
+
+; Similar to foo_2double but with a non-power-of-2 factor and potential
+; wrapping (either both indices wrap or neither does)
+; Function Attrs: nounwind ssp uwtable
+define void @foo_2double_non_power_of_2(i32 %u) #0 {
+; CHECK-LABEL: @foo_2double_non_power_of_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[U]], 6
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[MUL]], 6
+; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD6]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[MUL]], 7
+; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD7]] to i64
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %u.addr = alloca i32, align 4
+ store i32 %u, i32* %u.addr, align 4
+ %mul = mul i32 %u, 6
+ %add6 = add i32 %mul, 6
+ %idxprom = sext i32 %add6 to i64
+ %arrayidx = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx4 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom
+ %1 = load double, double* %arrayidx4, align 8
+ %add5 = fadd double %0, %1
+ store double %add5, double* %arrayidx, align 8
+ %add7 = add i32 %mul, 7
+ %idxprom12 = sext i32 %add7 to i64
+ %arrayidx13 = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom12
+ %2 = load double, double* %arrayidx13, align 8
+ %arrayidx17 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom12
+ %3 = load double, double* %arrayidx17, align 8
+ %add18 = fadd double %2, %3
+ store double %add18, double* %arrayidx13, align 8
+ ret void
+}
+
+; Similar to foo_2double_non_power_of_2 but with zext's instead of sext's
+; Function Attrs: nounwind ssp uwtable
+define void @foo_2double_non_power_of_2_zext(i32 %u) #0 {
+; CHECK-LABEL: @foo_2double_non_power_of_2_zext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[U]], 6
+; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[MUL]], 6
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[ADD6]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[MUL]], 7
+; CHECK-NEXT: [[IDXPROM12:%.*]] = zext i32 [[ADD7]] to i64
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %u.addr = alloca i32, align 4
+ store i32 %u, i32* %u.addr, align 4
+ %mul = mul i32 %u, 6
+ %add6 = add i32 %mul, 6
+ %idxprom = zext i32 %add6 to i64
+ %arrayidx = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx4 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom
+ %1 = load double, double* %arrayidx4, align 8
+ %add5 = fadd double %0, %1
+ store double %add5, double* %arrayidx, align 8
+ %add7 = add i32 %mul, 7
+ %idxprom12 = zext i32 %add7 to i64
+ %arrayidx13 = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 %idxprom12
+ %2 = load double, double* %arrayidx13, align 8
+ %arrayidx17 = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 %idxprom12
+ %3 = load double, double* %arrayidx17, align 8
+ %add18 = fadd double %2, %3
+ store double %add18, double* %arrayidx13, align 8
+ ret void
+}
+
+; Similar to foo_2double_non_power_of_2, but now we are dealing with AddRec SCEV.
+; Alternatively, this is like foo_loop, but with a non-power-of-2 factor and
+; potential wrapping (either both indices wrap or neither does)
+; Function Attrs: nounwind ssp uwtable
+define i32 @foo_loop_non_power_of_2(double* %A, i32 %n) #0 {
+; CHECK-LABEL: @foo_loop_non_power_of_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
+; CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[SUM:%.*]] = alloca double, align 8
+; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store double* [[A:%.*]], double** [[A_ADDR]], align 8
+; CHECK-NEXT: store i32 [[N:%.*]], i32* [[N_ADDR]], align 4
+; CHECK-NEXT: store double 0.000000e+00, double* [[SUM]], align 8
+; CHECK-NEXT: store i32 0, i32* [[I]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[N]]
+; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0]], 12
+; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[MUL]], 5
+; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD_5]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[MUL]], 6
+; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[ADD_6]] to i64
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM3]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> <double 7.000000e+00, double 7.000000e+00>, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[ADD6:%.*]] = fadd double [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[ADD7]] = fadd double [[TMP1]], [[ADD6]]
+; CHECK-NEXT: store double [[ADD7]], double* [[SUM]], align 8
+; CHECK-NEXT: [[INC]] = add i32 [[TMP0]], 1
+; CHECK-NEXT: store i32 [[INC]], i32* [[I]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: [[SPLIT:%.*]] = phi double [ [[ADD7]], [[FOR_BODY]] ]
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi double [ [[SPLIT]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[CONV:%.*]] = fptosi double [[DOTLCSSA]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %A.addr = alloca double*, align 8
+ %n.addr = alloca i32, align 4
+ %sum = alloca double, align 8
+ %i = alloca i32, align 4
+ store double* %A, double** %A.addr, align 8
+ store i32 %n, i32* %n.addr, align 4
+ store double 0.000000e+00, double* %sum, align 8
+ store i32 0, i32* %i, align 4
+ %cmp1 = icmp slt i32 0, %n
+ br i1 %cmp1, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %0 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %1 = phi double [ 0.000000e+00, %for.body.lr.ph ], [ %add7, %for.body ]
+ %mul = mul i32 %0, 12
+ %add.5 = add i32 %mul, 5
+ %idxprom = sext i32 %add.5 to i64
+ %arrayidx = getelementptr inbounds double, double* %A, i64 %idxprom
+ %2 = load double, double* %arrayidx, align 8
+ %mul1 = fmul double 7.000000e+00, %2
+ %add.6 = add i32 %mul, 6
+ %idxprom3 = sext i32 %add.6 to i64
+ %arrayidx4 = getelementptr inbounds double, double* %A, i64 %idxprom3
+ %3 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double 7.000000e+00, %3
+ %add6 = fadd double %mul1, %mul5
+ %add7 = fadd double %1, %add6
+ store double %add7, double* %sum, align 8
+ %inc = add i32 %0, 1
+ store i32 %inc, i32* %i, align 4
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %split = phi double [ %add7, %for.body ]
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ %.lcssa = phi double [ %split, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry ]
+ %conv = fptosi double %.lcssa to i32
+ ret i32 %conv
+}
+
+; This was generated by `clang -std=c11 -Wpedantic -Wall -O3 main.c -S -o - -emit-llvm`
+; with !{!"clang version 7.0.0 (trunk 337339) (llvm/trunk 337344)"}, with the !tbaa
+; metadata nodes stripped off to match the rest of the test file, where `cat main.c` is:
+;
+; double bar(double *a, unsigned n) {
+; double x = 0.0;
+; double y = 0.0;
+; for (unsigned i = 0; i < n; i += 2) {
+; x += a[i];
+; y += a[i + 1];
+; }
+; return x * y;
+; }
+;
+; The resulting IR is similar to @foo_loop, but with zext's instead of sext's.
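+; (With an unsigned induction variable the index computations are zero-extended
+; to i64; a[i] and a[i+1] are still provably consecutive, which is what the
+; vectorizer needs in order to form the <2 x double> load below.)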
+;
+; Make sure we are able to vectorize this from now on:
+;
+define double @bar(double* nocapture readonly %a, i32 %n) local_unnamed_addr #0 {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret double [[MUL]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x double> [ [[TMP6]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ]
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT: [[ADD1:%.*]] = or i32 [[I_018]], 1
+; CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[ADD1]] to i64
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[ADD5]] = add i32 [[I_018]], 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
+;
+entry:
+ %cmp15 = icmp eq i32 %n, 0
+ br i1 %cmp15, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %x.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %y.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add4, %for.body ]
+ %mul = fmul double %x.0.lcssa, %y.0.lcssa
+ ret double %mul
+
+for.body: ; preds = %entry, %for.body
+ %i.018 = phi i32 [ %add5, %for.body ], [ 0, %entry ]
+ %y.017 = phi double [ %add4, %for.body ], [ 0.000000e+00, %entry ]
+ %x.016 = phi double [ %add, %for.body ], [ 0.000000e+00, %entry ]
+ %idxprom = zext i32 %i.018 to i64
+ %arrayidx = getelementptr inbounds double, double* %a, i64 %idxprom
+ %0 = load double, double* %arrayidx, align 8
+ %add = fadd double %x.016, %0
+ %add1 = or i32 %i.018, 1
+ %idxprom2 = zext i32 %add1 to i64
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 %idxprom2
+ %1 = load double, double* %arrayidx3, align 8
+ %add4 = fadd double %y.017, %1
+ %add5 = add i32 %i.018, 2
+ %cmp = icmp ult i32 %add5, %n
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.5.0 "}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
new file mode 100644
index 00000000000..060cb054b7f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; We keep trying to vectorize the basic block even after we have already found a vectorized store.
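+; Below, the scalar double loads/fmuls/stores are vectorized into a
+; <2 x double> store, and the pre-existing <4 x i32> sequence that follows is
+; left intact; the CHECK lines verify both outcomes.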
+define void @test1(double* %a, double* %b, double* %c, double* %d) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A]] to <4 x i32>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[B]] to <4 x i32>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP7]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[D:%.*]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ %0 = bitcast double* %a to <4 x i32>*
+ %1 = load <4 x i32>, <4 x i32>* %0, align 8
+ %2 = bitcast double* %b to <4 x i32>*
+ %3 = load <4 x i32>, <4 x i32>* %2, align 8
+ %4 = mul <4 x i32> %1, %3
+ %5 = bitcast double* %d to <4 x i32>*
+ store <4 x i32> %4, <4 x i32>* %5, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
new file mode 100644
index 00000000000..e7bff49c09f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334 = type { %struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333, i16*, i8*, i8*, i32, i32, i64, i64, i32, i32, i32, [4 x i32], i32, i32, i32, i32, i32, [20 x i8] }
+%struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333 = type { i32, i32, i32, i32 }
+
+define fastcc void @LzmaDec_DecodeReal2(%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p) {
+; CHECK-LABEL: @LzmaDec_DecodeReal2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P:%.*]], i64 0, i32 4
+; CHECK-NEXT: [[CODE21_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P]], i64 0, i32 5
+; CHECK-NEXT: br label [[DO_BODY66_I:%.*]]
+; CHECK: do.body66.i:
+; CHECK-NEXT: [[RANGE_2_I:%.*]] = phi i32 [ [[RANGE_4_I:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[CODE_2_I:%.*]] = phi i32 [ [[CODE_4_I:%.*]], [[DO_COND_I]] ], [ undef, [[ENTRY]] ]
+; CHECK-NEXT: [[DOTRANGE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_2_I]]
+; CHECK-NEXT: [[DOTCODE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_2_I]]
+; CHECK-NEXT: br i1 undef, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]]
+; CHECK: if.else.i:
+; CHECK-NEXT: [[SUB91_I:%.*]] = sub i32 [[DOTRANGE_2_I]], undef
+; CHECK-NEXT: [[SUB92_I:%.*]] = sub i32 [[DOTCODE_2_I]], undef
+; CHECK-NEXT: br label [[DO_COND_I]]
+; CHECK: do.cond.i:
+; CHECK-NEXT: [[RANGE_4_I]] = phi i32 [ [[SUB91_I]], [[IF_ELSE_I]] ], [ undef, [[DO_BODY66_I]] ]
+; CHECK-NEXT: [[CODE_4_I]] = phi i32 [ [[SUB92_I]], [[IF_ELSE_I]] ], [ [[DOTCODE_2_I]], [[DO_BODY66_I]] ]
+; CHECK-NEXT: br i1 undef, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]]
+; CHECK: do.end1006.i:
+; CHECK-NEXT: [[DOTRANGE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_4_I]]
+; CHECK-NEXT: [[DOTCODE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_4_I]]
+; CHECK-NEXT: store i32 [[DOTRANGE_4_I]], i32* [[RANGE20_I]], align 4
+; CHECK-NEXT: store i32 [[DOTCODE_4_I]], i32* [[CODE21_I]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %range20.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334, %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 4
+ %code21.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334, %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 5
+ br label %do.body66.i
+
+do.body66.i: ; preds = %do.cond.i, %entry
+ %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ undef, %entry ]
+ %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ]
+ %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i
+ %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i
+ br i1 undef, label %do.cond.i, label %if.else.i
+
+if.else.i: ; preds = %do.body66.i
+ %sub91.i = sub i32 %.range.2.i, undef
+ %sub92.i = sub i32 %.code.2.i, undef
+ br label %do.cond.i
+
+do.cond.i: ; preds = %if.else.i, %do.body66.i
+ %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ]
+ %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ]
+ br i1 undef, label %do.body66.i, label %do.end1006.i
+
+do.end1006.i: ; preds = %do.cond.i
+ %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i
+ %.code.4.i = select i1 undef, i32 undef, i32 %code.4.i
+ store i32 %.range.4.i, i32* %range20.i, align 4
+ store i32 %.code.4.i, i32* %code21.i, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll
new file mode 100644
index 00000000000..eec53739b67
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin13.3.0"
+
+@a = common global double 0.000000e+00, align 8
+
+define i32 @fn1() {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INIT:%.*]] = load double, double* @a, align 8
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[PHI:%.*]] = phi double [ [[ADD2:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[POSTADD1_PHI:%.*]] = phi double [ [[POSTADD1:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY]] ]
+; CHECK-NEXT: [[POSTADD2_PHI:%.*]] = phi double [ [[POSTADD2:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY]] ]
+; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[POSTADD1_PHI]], undef
+; CHECK-NEXT: [[ADD2]] = fadd double [[POSTADD2_PHI]], [[PHI]]
+; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[ADD2]], 0.000000e+00
+; CHECK-NEXT: [[BINARYOP_B:%.*]] = fadd double [[POSTADD1_PHI]], [[MUL2]]
+; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[ADD1]], 0.000000e+00
+; CHECK-NEXT: [[TMP:%.*]] = fadd double [[POSTADD2_PHI]], 0.000000e+00
+; CHECK-NEXT: [[BINARY_V:%.*]] = fadd double [[MUL1]], [[BINARYOP_B]]
+; CHECK-NEXT: [[POSTADD1]] = fadd double [[BINARY_V]], 0.000000e+00
+; CHECK-NEXT: [[POSTADD2]] = fadd double [[TMP]], 1.000000e+00
+; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une double [[POSTADD1]], 0.000000e+00
+; CHECK-NEXT: br i1 [[TOBOOL]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 1
+;
+entry:
+ %init = load double, double* @a, align 8
+ br label %loop
+
+loop:
+ %phi = phi double [ %add2, %loop ], [ %init, %entry ]
+ %postadd1_phi = phi double [ %postadd1, %loop ], [ %init, %entry ]
+ %postadd2_phi = phi double [ %postadd2, %loop ], [ %init, %entry ]
+ %add1 = fadd double %postadd1_phi, undef
+ %add2 = fadd double %postadd2_phi, %phi
+ %mul2 = fmul double %add2, 0.000000e+00
+ %binaryop_B = fadd double %postadd1_phi, %mul2
+ %mul1 = fmul double %add1, 0.000000e+00
+ %tmp = fadd double %postadd2_phi, 0.000000e+00
+
+  ; tryToVectorize() starts with this binary instruction.
+  ; At the same time, vectorization wraps around the loop and vectorizes
+  ; postadd1/2 and eventually binary_V and tmp, so binary_V itself is
+  ; replaced with a vector instruction.
+  ; The SLPVectorizer crashed because it tried to use binary_V
+  ; after vectorization to re-arrange instructions.
+ %binary_V = fadd double %mul1, %binaryop_B
+
+ %postadd1 = fadd double %binary_V, 0.000000e+00
+ %postadd2 = fadd double %tmp, 1.000000e+00
+ %tobool = fcmp une double %postadd1, 0.000000e+00
+ br i1 %tobool, label %exit, label %loop
+
+exit:
+ ret i32 1
+}
+
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
new file mode 100644
index 00000000000..a1a3f50765a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960" = type { i32, i32 }
+
+define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* nocapture %info) {
+; CHECK-LABEL: @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: ret void
+; CHECK: if.else:
+; CHECK-NEXT: [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1
+; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]]
+; CHECK: land.lhs.true.i.1:
+; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]]
+; CHECK: if.then7.1:
+; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 0, 1
+; CHECK-NEXT: store i32 [[INC_1]], i32* [[M_NUMCONSTRAINTROWS4]], align 4
+; CHECK-NEXT: [[DEC_1:%.*]] = add nsw i32 6, -1
+; CHECK-NEXT: store i32 [[DEC_1]], i32* [[NUB5]], align 4
+; CHECK-NEXT: br label [[FOR_INC_1]]
+; CHECK: for.inc.1:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[DEC_1]], [[IF_THEN7_1]] ], [ 6, [[LAND_LHS_TRUE_I_1]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[INC_1]], [[IF_THEN7_1]] ], [ 0, [[LAND_LHS_TRUE_I_1]] ]
+; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[TMP1]], 1
+; CHECK-NEXT: store i32 [[INC_2]], i32* [[M_NUMCONSTRAINTROWS4]], align 4
+; CHECK-NEXT: [[DEC_2:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: store i32 [[DEC_2]], i32* [[NUB5]], align 4
+; CHECK-NEXT: unreachable
+;
+entry:
+ br i1 undef, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ ret void
+
+if.else: ; preds = %entry
+ %m_numConstraintRows4 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 0
+ %nub5 = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* %info, i64 0, i32 1
+ br i1 undef, label %land.lhs.true.i.1, label %if.then7.1
+
+land.lhs.true.i.1: ; preds = %if.else
+ br i1 undef, label %for.inc.1, label %if.then7.1
+
+if.then7.1: ; preds = %land.lhs.true.i.1, %if.else
+ %inc.1 = add nsw i32 0, 1
+ store i32 %inc.1, i32* %m_numConstraintRows4, align 4
+ %dec.1 = add nsw i32 6, -1
+ store i32 %dec.1, i32* %nub5, align 4
+ br label %for.inc.1
+
+for.inc.1: ; preds = %if.then7.1, %land.lhs.true.i.1
+ %0 = phi i32 [ %dec.1, %if.then7.1 ], [ 6, %land.lhs.true.i.1 ]
+ %1 = phi i32 [ %inc.1, %if.then7.1 ], [ 0, %land.lhs.true.i.1 ]
+ %inc.2 = add nsw i32 %1, 1
+ store i32 %inc.2, i32* %m_numConstraintRows4, align 4
+ %dec.2 = add nsw i32 %0, -1
+ store i32 %dec.2, i32* %nub5, align 4
+ unreachable
+}
+
+%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332 = type { float, [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [3 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector4.7.32.67.92.117.142.177.187.262.282.331, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, %class.btVector3.5.30.65.90.115.140.175.185.260.280.330, [4 x float], float, float, [4 x float], float, float, [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330], [16 x %class.btVector3.5.30.65.90.115.140.175.185.260.280.330] }
+%class.btVector3.5.30.65.90.115.140.175.185.260.280.330 = type { [4 x float] }
+%class.btVector4.7.32.67.92.117.142.177.187.262.282.331 = type { %class.btVector3.5.30.65.90.115.140.175.185.260.280.330 }
+
+define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this) {
+; CHECK-LABEL: @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332:%.*]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS:%.*]], i64 0, i32 2, i64 0, i32 0, i64 1
+; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS]], i64 0, i32 2, i64 0, i32 0, i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX36]], align 4
+; CHECK-NEXT: [[ADD587:%.*]] = fadd float undef, undef
+; CHECK-NEXT: [[SUB600:%.*]] = fsub float [[ADD587]], undef
+; CHECK-NEXT: store float [[SUB600]], float* undef, align 4
+; CHECK-NEXT: [[SUB613:%.*]] = fsub float [[ADD587]], [[SUB600]]
+; CHECK-NEXT: store float [[SUB613]], float* [[ARRAYIDX26]], align 4
+; CHECK-NEXT: [[ADD626:%.*]] = fadd float [[TMP0]], undef
+; CHECK-NEXT: [[SUB639:%.*]] = fsub float [[ADD626]], undef
+; CHECK-NEXT: [[SUB652:%.*]] = fsub float [[ADD626]], [[SUB639]]
+; CHECK-NEXT: store float [[SUB652]], float* [[ARRAYIDX36]], align 4
+; CHECK-NEXT: br i1 undef, label [[IF_ELSE1609:%.*]], label [[IF_THEN1595:%.*]]
+; CHECK: if.then1595:
+; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[FOR_BODY_LR_PH_I_I1702:%.*]]
+; CHECK: for.body.lr.ph.i.i1702:
+; CHECK-NEXT: unreachable
+; CHECK: if.else1609:
+; CHECK-NEXT: unreachable
+; CHECK: return:
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx26 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332, %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 1
+ %arrayidx36 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332, %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 2
+ %0 = load float, float* %arrayidx36, align 4
+ %add587 = fadd float undef, undef
+ %sub600 = fsub float %add587, undef
+ store float %sub600, float* undef, align 4
+ %sub613 = fsub float %add587, %sub600
+ store float %sub613, float* %arrayidx26, align 4
+ %add626 = fadd float %0, undef
+ %sub639 = fsub float %add626, undef
+ %sub652 = fsub float %add626, %sub639
+ store float %sub652, float* %arrayidx36, align 4
+ br i1 undef, label %if.else1609, label %if.then1595
+
+if.then1595: ; preds = %entry
+ br i1 undef, label %return, label %for.body.lr.ph.i.i1702
+
+for.body.lr.ph.i.i1702: ; preds = %if.then1595
+ unreachable
+
+if.else1609: ; preds = %entry
+ unreachable
+
+return: ; preds = %if.then1595
+ ret void
+}
+
+define void @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36btDiscreteCollisionDetectorInterface6ResultE() {
+; CHECK-LABEL: @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36btDiscreteCollisionDetectorInterface6ResultE(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END111:%.*]]
+; CHECK: if.end111:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END136:%.*]]
+; CHECK: if.end136:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END162:%.*]]
+; CHECK: if.end162:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END189:%.*]]
+; CHECK: if.end189:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END216:%.*]]
+; CHECK: if.end216:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN218:%.*]], label [[IF_END225:%.*]]
+; CHECK: if.then218:
+; CHECK-NEXT: br label [[IF_END225]]
+; CHECK: if.end225:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END248:%.*]]
+; CHECK: if.end248:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END304:%.*]]
+; CHECK: if.end304:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END361:%.*]]
+; CHECK: if.end361:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN370:%.*]], label [[IF_END395:%.*]]
+; CHECK: if.then370:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN374:%.*]], label [[IF_END395]]
+; CHECK: if.then374:
+; CHECK-NEXT: br label [[IF_END395]]
+; CHECK: if.end395:
+; CHECK-NEXT: unreachable
+; CHECK: return:
+; CHECK-NEXT: ret void
+;
+entry:
+ %add8.i2343 = fadd float undef, undef
+ %add8.i2381 = fadd float undef, undef
+ br i1 undef, label %return, label %if.end
+
+if.end: ; preds = %entry
+ br i1 undef, label %return, label %if.end111
+
+if.end111: ; preds = %if.end
+ br i1 undef, label %return, label %if.end136
+
+if.end136: ; preds = %if.end111
+ br i1 undef, label %return, label %if.end162
+
+if.end162: ; preds = %if.end136
+ br i1 undef, label %return, label %if.end189
+
+if.end189: ; preds = %if.end162
+ br i1 undef, label %return, label %if.end216
+
+if.end216: ; preds = %if.end189
+ br i1 undef, label %if.then218, label %if.end225
+
+if.then218: ; preds = %if.end216
+ br label %if.end225
+
+if.end225: ; preds = %if.then218, %if.end216
+ br i1 undef, label %return, label %if.end248
+
+if.end248: ; preds = %if.end225
+ br i1 undef, label %return, label %if.end304
+
+if.end304: ; preds = %if.end248
+ %mul341 = fmul float undef, %add8.i2343
+ %mul344 = fmul float undef, %add8.i2381
+ %sub345 = fsub float %mul341, %mul344
+ br i1 undef, label %return, label %if.end361
+
+if.end361: ; preds = %if.end304
+ %mul364 = fmul float %add8.i2381, %add8.i2381
+ br i1 undef, label %if.then370, label %if.end395
+
+if.then370: ; preds = %if.end361
+ br i1 undef, label %if.then374, label %if.end395
+
+if.then374: ; preds = %if.then370
+ %cmp392 = fcmp olt float %sub345, 0.000000e+00
+ br label %if.end395
+
+if.end395: ; preds = %if.then374, %if.then370, %if.end361
+ unreachable
+
+return: ; preds = %if.end304, %if.end248, %if.end225, %if.end189, %if.end162, %if.end136, %if.end111, %if.end, %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
new file mode 100644
index 00000000000..7ec4fc18598
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113 = type { [4 x float] }
+
+; Function Attrs: ssp uwtable
+define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(%class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices) #0 align 2 {
+; CHECK-LABEL: @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN17_1:%.*]], label [[IF_END22_1:%.*]]
+; CHECK: for.end36:
+; CHECK-NEXT: br label [[FOR_BODY144:%.*]]
+; CHECK: for.body144:
+; CHECK-NEXT: br i1 undef, label [[FOR_END227:%.*]], label [[FOR_BODY144]]
+; CHECK: for.end227:
+; CHECK-NEXT: br i1 undef, label [[FOR_END271:%.*]], label [[FOR_BODY233:%.*]]
+; CHECK: for.body233:
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY233]], label [[FOR_END271]]
+; CHECK: for.end271:
+; CHECK-NEXT: [[TMP0:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ]
+; CHECK-NEXT: [[SUB275:%.*]] = fsub float undef, [[TMP1]]
+; CHECK-NEXT: [[SUB279:%.*]] = fsub float undef, [[TMP0]]
+; CHECK-NEXT: br i1 undef, label [[IF_THEN291:%.*]], label [[RETURN]]
+; CHECK: if.then291:
+; CHECK-NEXT: [[MUL292:%.*]] = fmul float [[SUB275]], 5.000000e-01
+; CHECK-NEXT: [[ADD294:%.*]] = fadd float [[TMP1]], [[MUL292]]
+; CHECK-NEXT: [[MUL295:%.*]] = fmul float [[SUB279]], 5.000000e-01
+; CHECK-NEXT: [[ADD297:%.*]] = fadd float [[TMP0]], [[MUL295]]
+; CHECK-NEXT: br i1 undef, label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]]
+; CHECK: if.else319:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN325:%.*]], label [[IF_END327:%.*]]
+; CHECK: if.then325:
+; CHECK-NEXT: br label [[IF_END327]]
+; CHECK: if.end327:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN329:%.*]], label [[IF_END332]]
+; CHECK: if.then329:
+; CHECK-NEXT: br label [[IF_END332]]
+; CHECK: if.end332:
+; CHECK-NEXT: [[DX272_1:%.*]] = phi float [ [[SUB275]], [[IF_THEN329]] ], [ [[SUB275]], [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ]
+; CHECK-NEXT: [[DY276_1:%.*]] = phi float [ undef, [[IF_THEN329]] ], [ undef, [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ]
+; CHECK-NEXT: [[SUB334:%.*]] = fsub float [[ADD294]], [[DX272_1]]
+; CHECK-NEXT: [[SUB338:%.*]] = fsub float [[ADD297]], [[DY276_1]]
+; CHECK-NEXT: [[ARRAYIDX_I_I606:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113:%.*]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES:%.*]], i64 0, i32 0, i64 0
+; CHECK-NEXT: store float [[SUB334]], float* [[ARRAYIDX_I_I606]], align 4
+; CHECK-NEXT: [[ARRAYIDX3_I607:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES]], i64 0, i32 0, i64 1
+; CHECK-NEXT: store float [[SUB338]], float* [[ARRAYIDX3_I607]], align 4
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: ret void
+; CHECK: if.then17.1:
+; CHECK-NEXT: br label [[IF_END22_1]]
+; CHECK: if.end22.1:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN17_2:%.*]], label [[IF_END22_2:%.*]]
+; CHECK: if.then17.2:
+; CHECK-NEXT: br label [[IF_END22_2]]
+; CHECK: if.end22.2:
+; CHECK-NEXT: br i1 undef, label [[FOR_END36:%.*]], label [[FOR_BODY]]
+;
+entry:
+ br i1 undef, label %return, label %if.end
+
+if.end: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %if.end22.2, %if.end
+ br i1 undef, label %if.then17.1, label %if.end22.1
+
+for.end36: ; preds = %if.end22.2
+ br label %for.body144
+
+for.body144: ; preds = %for.body144, %for.end36
+ br i1 undef, label %for.end227, label %for.body144
+
+for.end227: ; preds = %for.body144
+ br i1 undef, label %for.end271, label %for.body233
+
+for.body233: ; preds = %for.body233, %for.end227
+ br i1 undef, label %for.body233, label %for.end271
+
+for.end271: ; preds = %for.body233, %for.end227
+ %0 = phi float [ 0x47EFFFFFE0000000, %for.end227 ], [ undef, %for.body233 ]
+ %1 = phi float [ 0x47EFFFFFE0000000, %for.end227 ], [ undef, %for.body233 ]
+ %sub275 = fsub float undef, %1
+ %sub279 = fsub float undef, %0
+ br i1 undef, label %if.then291, label %return
+
+if.then291: ; preds = %for.end271
+ %mul292 = fmul float %sub275, 5.000000e-01
+ %add294 = fadd float %1, %mul292
+ %mul295 = fmul float %sub279, 5.000000e-01
+ %add297 = fadd float %0, %mul295
+ br i1 undef, label %if.end332, label %if.else319
+
+if.else319: ; preds = %if.then291
+ br i1 undef, label %if.then325, label %if.end327
+
+if.then325: ; preds = %if.else319
+ br label %if.end327
+
+if.end327: ; preds = %if.then325, %if.else319
+ br i1 undef, label %if.then329, label %if.end332
+
+if.then329: ; preds = %if.end327
+ br label %if.end332
+
+if.end332: ; preds = %if.then329, %if.end327, %if.then291
+ %dx272.1 = phi float [ %sub275, %if.then329 ], [ %sub275, %if.end327 ], [ 0x3F847AE140000000, %if.then291 ]
+ %dy276.1 = phi float [ undef, %if.then329 ], [ undef, %if.end327 ], [ 0x3F847AE140000000, %if.then291 ]
+ %sub334 = fsub float %add294, %dx272.1
+ %sub338 = fsub float %add297, %dy276.1
+ %arrayidx.i.i606 = getelementptr inbounds %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113, %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices, i64 0, i32 0, i64 0
+ store float %sub334, float* %arrayidx.i.i606, align 4
+ %arrayidx3.i607 = getelementptr inbounds %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113, %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices, i64 0, i32 0, i64 1
+ store float %sub338, float* %arrayidx3.i607, align 4
+ br label %return
+
+return: ; preds = %if.end332, %for.end271, %entry
+ ret void
+
+if.then17.1: ; preds = %for.body
+ br label %if.end22.1
+
+if.end22.1: ; preds = %if.then17.1, %for.body
+ br i1 undef, label %if.then17.2, label %if.end22.2
+
+if.then17.2: ; preds = %if.end22.1
+ br label %if.end22.2
+
+if.end22.2: ; preds = %if.then17.2, %if.end22.1
+ br i1 undef, label %for.end36, label %for.body
+}
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
new file mode 100644
index 00000000000..550b8314d60
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mattr=+avx | FileCheck %s -check-prefixes=CHECK,AVX
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) {
+; SSE-LABEL: @testfunc(
+; SSE-NEXT: entry:
+; SSE-NEXT: br label [[FOR_BODY:%.*]]
+; SSE: for.body:
+; SSE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[S1_055:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I40:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[S0_054:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[COND_I44:%.*]], [[FOR_BODY]] ]
+; SSE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
+; SSE-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; SSE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
+; SSE-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
+; SSE-NEXT: [[ADD:%.*]] = fadd float [[S0_054]], [[TMP0]]
+; SSE-NEXT: [[ADD3:%.*]] = fadd float [[S1_055]], [[TMP0]]
+; SSE-NEXT: [[MUL:%.*]] = fmul float [[S0_054]], 0.000000e+00
+; SSE-NEXT: [[ADD4:%.*]] = fadd float [[MUL]], [[ADD3]]
+; SSE-NEXT: [[MUL5:%.*]] = fmul float [[S1_055]], 0.000000e+00
+; SSE-NEXT: [[ADD6:%.*]] = fadd float [[MUL5]], [[ADD]]
+; SSE-NEXT: [[CMP_I:%.*]] = fcmp olt float [[ADD6]], 1.000000e+00
+; SSE-NEXT: [[COND_I:%.*]] = select i1 [[CMP_I]], float [[ADD6]], float 1.000000e+00
+; SSE-NEXT: [[CMP_I51:%.*]] = fcmp olt float [[COND_I]], -1.000000e+00
+; SSE-NEXT: [[CMP_I49:%.*]] = fcmp olt float [[ADD4]], 1.000000e+00
+; SSE-NEXT: [[COND_I50:%.*]] = select i1 [[CMP_I49]], float [[ADD4]], float 1.000000e+00
+; SSE-NEXT: [[CMP_I47:%.*]] = fcmp olt float [[COND_I50]], -1.000000e+00
+; SSE-NEXT: [[COND_I_OP:%.*]] = fmul float [[COND_I]], 0.000000e+00
+; SSE-NEXT: [[MUL10:%.*]] = select i1 [[CMP_I51]], float -0.000000e+00, float [[COND_I_OP]]
+; SSE-NEXT: [[COND_I50_OP:%.*]] = fmul float [[COND_I50]], 0.000000e+00
+; SSE-NEXT: [[MUL11:%.*]] = select i1 [[CMP_I47]], float -0.000000e+00, float [[COND_I50_OP]]
+; SSE-NEXT: [[ADD13]] = fadd float [[MUL10]], [[MUL11]]
+; SSE-NEXT: [[CMP_I45:%.*]] = fcmp olt float [[ADD13]], 1.000000e+00
+; SSE-NEXT: [[COND_I46:%.*]] = select i1 [[CMP_I45]], float [[ADD13]], float 1.000000e+00
+; SSE-NEXT: [[CMP_I43:%.*]] = fcmp olt float [[COND_I46]], -1.000000e+00
+; SSE-NEXT: [[COND_I44]] = select i1 [[CMP_I43]], float -1.000000e+00, float [[COND_I46]]
+; SSE-NEXT: [[CMP_I41:%.*]] = fcmp olt float [[MUL11]], 1.000000e+00
+; SSE-NEXT: [[COND_I42:%.*]] = select i1 [[CMP_I41]], float [[MUL11]], float 1.000000e+00
+; SSE-NEXT: [[CMP_I39:%.*]] = fcmp olt float [[COND_I42]], -1.000000e+00
+; SSE-NEXT: [[COND_I40]] = select i1 [[CMP_I39]], float -1.000000e+00, float [[COND_I42]]
+; SSE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
+; SSE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; SSE: for.end:
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @testfunc(
+; AVX-NEXT: entry:
+; AVX-NEXT: br label [[FOR_BODY:%.*]]
+; AVX: for.body:
+; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ]
+; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ]
+; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
+; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
+; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4
+; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
+; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
+; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
+; AVX-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
+; AVX-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]]
+; AVX-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], <float 1.000000e+00, float 1.000000e+00>
+; AVX-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; AVX-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], <float -1.000000e+00, float -1.000000e+00>
+; AVX-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
+; AVX-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP14]]
+; AVX-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
+; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
+; AVX-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]]
+; AVX-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP17]], i32 0
+; AVX-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1
+; AVX-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], <float 1.000000e+00, float 1.000000e+00>
+; AVX-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; AVX-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], <float -1.000000e+00, float -1.000000e+00>
+; AVX-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> <float -1.000000e+00, float -1.000000e+00>, <2 x float> [[TMP21]]
+; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32
+; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; AVX: for.end:
+; AVX-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %acc1.056 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
+ %s1.055 = phi float [ 0.000000e+00, %entry ], [ %cond.i40, %for.body ]
+ %s0.054 = phi float [ 0.000000e+00, %entry ], [ %cond.i44, %for.body ]
+ %arrayidx = getelementptr inbounds float, float* %src, i64 %indvars.iv
+ %0 = load float, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds float, float* %dest, i64 %indvars.iv
+ store float %acc1.056, float* %arrayidx2, align 4
+ %add = fadd float %s0.054, %0
+ %add3 = fadd float %s1.055, %0
+ %mul = fmul float %s0.054, 0.000000e+00
+ %add4 = fadd float %mul, %add3
+ %mul5 = fmul float %s1.055, 0.000000e+00
+ %add6 = fadd float %mul5, %add
+ %cmp.i = fcmp olt float %add6, 1.000000e+00
+ %cond.i = select i1 %cmp.i, float %add6, float 1.000000e+00
+ %cmp.i51 = fcmp olt float %cond.i, -1.000000e+00
+ %cmp.i49 = fcmp olt float %add4, 1.000000e+00
+ %cond.i50 = select i1 %cmp.i49, float %add4, float 1.000000e+00
+ %cmp.i47 = fcmp olt float %cond.i50, -1.000000e+00
+ %cond.i.op = fmul float %cond.i, 0.000000e+00
+ %mul10 = select i1 %cmp.i51, float -0.000000e+00, float %cond.i.op
+ %cond.i50.op = fmul float %cond.i50, 0.000000e+00
+ %mul11 = select i1 %cmp.i47, float -0.000000e+00, float %cond.i50.op
+ %add13 = fadd float %mul10, %mul11
+
+ ; The SLPVectorizer crashed in vectorizeChainsInBlock() because it tried
+ ; to access the second operand of the following cmp after the cmp itself
+ ; was already vectorized and deleted.
+ %cmp.i45 = fcmp olt float %add13, 1.000000e+00
+
+ %cond.i46 = select i1 %cmp.i45, float %add13, float 1.000000e+00
+ %cmp.i43 = fcmp olt float %cond.i46, -1.000000e+00
+ %cond.i44 = select i1 %cmp.i43, float -1.000000e+00, float %cond.i46
+ %cmp.i41 = fcmp olt float %mul11, 1.000000e+00
+ %cond.i42 = select i1 %cmp.i41, float %mul11, float 1.000000e+00
+ %cmp.i39 = fcmp olt float %cond.i42, -1.000000e+00
+ %cond.i40 = select i1 %cmp.i39, float -1.000000e+00, float %cond.i42
+ %exitcond = icmp eq i64 %indvars.iv.next, 32
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll
new file mode 100644
index 00000000000..b5f736af624
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731" = type { double*, double*, double*, double** }
+
+; Function Attrs: nounwind ssp uwtable
+define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* nocapture %__last) {
+; CHECK-LABEL: @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[_M_CUR2_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_CUR2_I_I]], align 8
+; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST]], i64 0, i32 1
+; CHECK-NEXT: [[_M_CUR2_I_I81:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[_M_CUR2_I_I81]], align 8
+; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = load double*, double** [[_M_FIRST3_I_I83]], align 8
+; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]]
+; CHECK: while.cond.i.preheader:
+; CHECK-NEXT: br label [[WHILE_COND_I:%.*]]
+; CHECK: while.cond.i:
+; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_BODY_I:%.*]]
+; CHECK: while.body.i:
+; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]]
+; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit:
+; CHECK-NEXT: [[TMP3:%.*]] = phi double* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi double* [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ]
+; CHECK-NEXT: store double* [[TMP4]], double** [[_M_CUR2_I_I]], align 8
+; CHECK-NEXT: store double* [[TMP3]], double** [[_M_FIRST3_I_I]], align 8
+; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]]
+; CHECK: if.then.i55:
+; CHECK-NEXT: br label [[WHILE_COND]]
+; CHECK: while.cond:
+; CHECK-NEXT: br label [[WHILE_COND]]
+;
+entry:
+ %_M_cur2.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 0
+ %0 = load double*, double** %_M_cur2.i.i, align 8
+ %_M_first3.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 1
+ %_M_cur2.i.i81 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 0
+ %1 = load double*, double** %_M_cur2.i.i81, align 8
+ %_M_first3.i.i83 = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__last, i64 0, i32 1
+ %2 = load double*, double** %_M_first3.i.i83, align 8
+ br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i.preheader
+
+while.cond.i.preheader: ; preds = %entry
+ br label %while.cond.i
+
+while.cond.i: ; preds = %while.body.i, %while.cond.i.preheader
+ br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.body.i
+
+while.body.i: ; preds = %while.cond.i
+ br i1 undef, label %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit, label %while.cond.i
+
+_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: ; preds = %while.body.i, %while.cond.i, %entry
+ %3 = phi double* [ %2, %entry ], [ %2, %while.cond.i ], [ undef, %while.body.i ]
+ %4 = phi double* [ %0, %entry ], [ %1, %while.cond.i ], [ undef, %while.body.i ]
+ store double* %4, double** %_M_cur2.i.i, align 8
+ store double* %3, double** %_M_first3.i.i, align 8
+ br i1 undef, label %if.then.i55, label %while.cond
+
+if.then.i55: ; preds = %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit
+ br label %while.cond
+
+while.cond: ; preds = %while.cond, %if.then.i55, %_ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit
+ br label %while.cond
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
new file mode 100644
index 00000000000..d149c27b7c7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @main() #0 {
+; CHECK-LABEL: @main(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]]
+; CHECK: while.body:
+; CHECK-NEXT: unreachable
+; CHECK: while.end:
+; CHECK-NEXT: br i1 undef, label [[FOR_END80:%.*]], label [[FOR_BODY75_LR_PH:%.*]]
+; CHECK: for.body75.lr.ph:
+; CHECK-NEXT: br label [[FOR_BODY75:%.*]]
+; CHECK: for.body75:
+; CHECK-NEXT: br label [[FOR_BODY75]]
+; CHECK: for.end80:
+; CHECK-NEXT: br i1 undef, label [[FOR_END300:%.*]], label [[FOR_BODY267_LR_PH:%.*]]
+; CHECK: for.body267.lr.ph:
+; CHECK-NEXT: br label [[FOR_BODY267:%.*]]
+; CHECK: for.body267:
+; CHECK-NEXT: [[S_71010:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY267_LR_PH]] ], [ [[ADD297:%.*]], [[FOR_BODY267]] ]
+; CHECK-NEXT: [[MUL269:%.*]] = fmul double undef, undef
+; CHECK-NEXT: [[MUL270:%.*]] = fmul double [[MUL269]], [[MUL269]]
+; CHECK-NEXT: [[ADD282:%.*]] = fadd double undef, undef
+; CHECK-NEXT: [[MUL283:%.*]] = fmul double [[MUL269]], [[ADD282]]
+; CHECK-NEXT: [[ADD293:%.*]] = fadd double undef, undef
+; CHECK-NEXT: [[MUL294:%.*]] = fmul double [[MUL270]], [[ADD293]]
+; CHECK-NEXT: [[ADD295:%.*]] = fadd double undef, [[MUL294]]
+; CHECK-NEXT: [[DIV296:%.*]] = fdiv double [[MUL283]], [[ADD295]]
+; CHECK-NEXT: [[ADD297]] = fadd double [[S_71010]], [[DIV296]]
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY267]], label [[FOR_END300]]
+; CHECK: for.end300:
+; CHECK-NEXT: unreachable
+;
+entry:
+ br i1 undef, label %while.body, label %while.end
+
+while.body: ; preds = %entry
+ unreachable
+
+while.end: ; preds = %entry
+ br i1 undef, label %for.end80, label %for.body75.lr.ph
+
+for.body75.lr.ph: ; preds = %while.end
+ br label %for.body75
+
+for.body75: ; preds = %for.body75, %for.body75.lr.ph
+ br label %for.body75
+
+for.end80: ; preds = %while.end
+ br i1 undef, label %for.end300, label %for.body267.lr.ph
+
+for.body267.lr.ph: ; preds = %for.end80
+ br label %for.body267
+
+for.body267: ; preds = %for.body267, %for.body267.lr.ph
+ %s.71010 = phi double [ 0.000000e+00, %for.body267.lr.ph ], [ %add297, %for.body267 ]
+ %mul269 = fmul double undef, undef
+ %mul270 = fmul double %mul269, %mul269
+ %add282 = fadd double undef, undef
+ %mul283 = fmul double %mul269, %add282
+ %add293 = fadd double undef, undef
+ %mul294 = fmul double %mul270, %add293
+ %add295 = fadd double undef, %mul294
+ %div296 = fdiv double %mul283, %add295
+ %add297 = fadd double %s.71010, %div296
+ br i1 undef, label %for.body267, label %for.end300
+
+for.end300: ; preds = %for.body267, %for.end80
+ unreachable
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll
new file mode 100644
index 00000000000..eca21e7d3a1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common global i64* null, align 8
+
+; Function Attrs: nounwind uwtable
+define i32 @fn1() {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i64* [[ADD_PTR]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2
+; CHECK-NEXT: store i64 [[TMP1]], i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i64* [[ARRAYIDX]] to i64
+; CHECK-NEXT: store i64 [[TMP2]], i64* [[ADD_PTR]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load i64*, i64** @a, align 8
+ %add.ptr = getelementptr inbounds i64, i64* %0, i64 1
+ %1 = ptrtoint i64* %add.ptr to i64
+ %arrayidx = getelementptr inbounds i64, i64* %0, i64 2
+ store i64 %1, i64* %arrayidx, align 8
+ %2 = ptrtoint i64* %arrayidx to i64
+ store i64 %2, i64* %add.ptr, align 8
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
new file mode 100644
index 00000000000..53f61fe68ea
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @RCModelEstimator() {
+; CHECK-LABEL: @RCModelEstimator(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END_THREAD:%.*]]
+; CHECK: for.end.thread:
+; CHECK-NEXT: unreachable
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY3:%.*]], label [[IF_END103:%.*]]
+; CHECK: for.cond14.preheader:
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY16_LR_PH:%.*]], label [[IF_END103]]
+; CHECK: for.body16.lr.ph:
+; CHECK-NEXT: br label [[FOR_BODY16:%.*]]
+; CHECK: for.body3:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN7:%.*]], label [[FOR_INC11:%.*]]
+; CHECK: if.then7:
+; CHECK-NEXT: br label [[FOR_INC11]]
+; CHECK: for.inc11:
+; CHECK-NEXT: br i1 false, label [[FOR_COND14_PREHEADER:%.*]], label [[FOR_BODY3]]
+; CHECK: for.body16:
+; CHECK-NEXT: br i1 undef, label [[FOR_END39:%.*]], label [[FOR_BODY16]]
+; CHECK: for.end39:
+; CHECK-NEXT: br i1 undef, label [[IF_END103]], label [[FOR_COND45_PREHEADER:%.*]]
+; CHECK: for.cond45.preheader:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN88:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then88:
+; CHECK-NEXT: br label [[IF_END103]]
+; CHECK: if.else:
+; CHECK-NEXT: br label [[IF_END103]]
+; CHECK: if.end103:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %for.body.lr.ph, label %for.end.thread
+
+for.end.thread: ; preds = %entry
+ unreachable
+
+for.body.lr.ph: ; preds = %entry
+ br i1 undef, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %for.body.lr.ph
+ br i1 undef, label %for.body3, label %if.end103
+
+for.cond14.preheader: ; preds = %for.inc11
+ br i1 undef, label %for.body16.lr.ph, label %if.end103
+
+for.body16.lr.ph: ; preds = %for.cond14.preheader
+ br label %for.body16
+
+for.body3: ; preds = %for.inc11, %for.end
+ br i1 undef, label %if.then7, label %for.inc11
+
+if.then7: ; preds = %for.body3
+ br label %for.inc11
+
+for.inc11: ; preds = %if.then7, %for.body3
+ br i1 false, label %for.cond14.preheader, label %for.body3
+
+for.body16: ; preds = %for.body16, %for.body16.lr.ph
+ br i1 undef, label %for.end39, label %for.body16
+
+for.end39: ; preds = %for.body16
+ br i1 undef, label %if.end103, label %for.cond45.preheader
+
+for.cond45.preheader: ; preds = %for.end39
+ br i1 undef, label %if.then88, label %if.else
+
+if.then88: ; preds = %for.cond45.preheader
+ %mul89 = fmul double 0.000000e+00, 0.000000e+00
+ %mul90 = fmul double 0.000000e+00, 0.000000e+00
+ %sub91 = fsub double %mul89, %mul90
+ %div92 = fdiv double %sub91, undef
+ %mul94 = fmul double 0.000000e+00, 0.000000e+00
+ %mul95 = fmul double 0.000000e+00, 0.000000e+00
+ %sub96 = fsub double %mul94, %mul95
+ %div97 = fdiv double %sub96, undef
+ br label %if.end103
+
+if.else: ; preds = %for.cond45.preheader
+ br label %if.end103
+
+if.end103: ; preds = %if.else, %if.then88, %for.end39, %for.cond14.preheader, %for.end
+ %0 = phi double [ 0.000000e+00, %for.end39 ], [ %div97, %if.then88 ], [ 0.000000e+00, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ]
+ %1 = phi double [ undef, %for.end39 ], [ %div92, %if.then88 ], [ undef, %if.else ], [ 0.000000e+00, %for.cond14.preheader ], [ 0.000000e+00, %for.end ]
+ ret void
+}
+
+
+define void @intrapred_luma() {
+; CHECK-LABEL: @intrapred_luma(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV153:%.*]] = trunc i32 undef to i16
+; CHECK-NEXT: [[ARRAYIDX154:%.*]] = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 12
+; CHECK-NEXT: store i16 [[CONV153]], i16* [[ARRAYIDX154]], align 8
+; CHECK-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 11
+; CHECK-NEXT: store i16 [[CONV153]], i16* [[ARRAYIDX155]], align 2
+; CHECK-NEXT: [[ARRAYIDX156:%.*]] = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 10
+; CHECK-NEXT: store i16 [[CONV153]], i16* [[ARRAYIDX156]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %conv153 = trunc i32 undef to i16
+ %arrayidx154 = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 12
+ store i16 %conv153, i16* %arrayidx154, align 8
+ %arrayidx155 = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 11
+ store i16 %conv153, i16* %arrayidx155, align 2
+ %arrayidx156 = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 10
+ store i16 %conv153, i16* %arrayidx156, align 4
+ ret void
+}
+
+define fastcc void @dct36(double* %inbuf) {
+; CHECK-LABEL: @dct36(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[INBUF]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double undef, i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx41 = getelementptr inbounds double, double* %inbuf, i64 2
+ %arrayidx44 = getelementptr inbounds double, double* %inbuf, i64 1
+ %0 = load double, double* %arrayidx44, align 8
+ %add46 = fadd double %0, undef
+ store double %add46, double* %arrayidx41, align 8
+ %1 = load double, double* %inbuf, align 8
+ %add49 = fadd double %1, %0
+ store double %add49, double* %arrayidx44, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
new file mode 100644
index 00000000000..8f57a820197
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @main() {
+; CHECK-LABEL: @main(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: br label [[FOR_COND4_PREHEADER:%.*]]
+; CHECK: for.cond4.preheader:
+; CHECK-NEXT: br label [[FOR_BODY6:%.*]]
+; CHECK: for.body6:
+; CHECK-NEXT: br label [[FOR_BODY12:%.*]]
+; CHECK: for.body12:
+; CHECK-NEXT: [[FZIMG_069:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD19:%.*]], [[IF_END:%.*]] ]
+; CHECK-NEXT: [[FZREAL_068:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD20:%.*]], [[IF_END]] ]
+; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[FZREAL_068]], [[FZREAL_068]]
+; CHECK-NEXT: [[MUL14:%.*]] = fmul double [[FZIMG_069]], [[FZIMG_069]]
+; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[MUL13]], [[MUL14]]
+; CHECK-NEXT: [[CMP16:%.*]] = fcmp ogt double [[ADD15]], 4.000000e+00
+; CHECK-NEXT: br i1 [[CMP16]], label [[FOR_INC21:%.*]], label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[FZIMG_069]]
+; CHECK-NEXT: [[ADD19]] = fadd double undef, [[MUL18]]
+; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MUL13]], [[MUL14]]
+; CHECK-NEXT: [[ADD20]] = fadd double undef, [[SUB]]
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY12]], label [[FOR_INC21]]
+; CHECK: for.inc21:
+; CHECK-NEXT: br i1 undef, label [[FOR_END23:%.*]], label [[FOR_BODY6]]
+; CHECK: for.end23:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN25:%.*]], label [[IF_THEN26:%.*]]
+; CHECK: if.then25:
+; CHECK-NEXT: br i1 undef, label [[FOR_END44:%.*]], label [[FOR_COND4_PREHEADER]]
+; CHECK: if.then26:
+; CHECK-NEXT: unreachable
+; CHECK: for.end44:
+; CHECK-NEXT: br i1 undef, label [[FOR_END48:%.*]], label [[FOR_BODY]]
+; CHECK: for.end48:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.end44, %entry
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %if.then25, %for.body
+ br label %for.body6
+
+for.body6: ; preds = %for.inc21, %for.cond4.preheader
+ br label %for.body12
+
+for.body12: ; preds = %if.end, %for.body6
+ %fZImg.069 = phi double [ undef, %for.body6 ], [ %add19, %if.end ]
+ %fZReal.068 = phi double [ undef, %for.body6 ], [ %add20, %if.end ]
+ %mul13 = fmul double %fZReal.068, %fZReal.068
+ %mul14 = fmul double %fZImg.069, %fZImg.069
+ %add15 = fadd double %mul13, %mul14
+ %cmp16 = fcmp ogt double %add15, 4.000000e+00
+ br i1 %cmp16, label %for.inc21, label %if.end
+
+if.end: ; preds = %for.body12
+ %mul18 = fmul double undef, %fZImg.069
+ %add19 = fadd double undef, %mul18
+ %sub = fsub double %mul13, %mul14
+ %add20 = fadd double undef, %sub
+ br i1 undef, label %for.body12, label %for.inc21
+
+for.inc21: ; preds = %if.end, %for.body12
+ br i1 undef, label %for.end23, label %for.body6
+
+for.end23: ; preds = %for.inc21
+ br i1 undef, label %if.then25, label %if.then26
+
+if.then25: ; preds = %for.end23
+ br i1 undef, label %for.end44, label %for.cond4.preheader
+
+if.then26: ; preds = %for.end23
+ unreachable
+
+for.end44: ; preds = %if.then25
+ br i1 undef, label %for.end48, label %for.body
+
+for.end48: ; preds = %for.end44
+ ret void
+}
+
+%struct.hoge = type { double, double, double}
+
+define void @zot(%struct.hoge* %arg) {
+; CHECK-LABEL: @zot(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP:%.*]] = load double, double* undef, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load double, double* undef, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: br i1 undef, label [[BB11:%.*]], label [[BB12:%.*]]
+; CHECK: bb11:
+; CHECK-NEXT: br label [[BB14:%.*]]
+; CHECK: bb12:
+; CHECK-NEXT: br label [[BB14]]
+; CHECK: bb14:
+; CHECK-NEXT: ret void
+;
+bb:
+ %tmp = load double, double* undef, align 8
+ %tmp1 = fsub double %tmp, undef
+ %tmp2 = load double, double* undef, align 8
+ %tmp3 = fsub double %tmp2, undef
+ %tmp4 = fmul double %tmp3, undef
+ %tmp5 = fmul double %tmp3, undef
+ %tmp6 = fsub double %tmp5, undef
+ %tmp7 = getelementptr inbounds %struct.hoge, %struct.hoge* %arg, i64 0, i32 1
+ store double %tmp6, double* %tmp7, align 8
+ %tmp8 = fmul double %tmp1, undef
+ %tmp9 = fsub double %tmp8, undef
+ %tmp10 = getelementptr inbounds %struct.hoge, %struct.hoge* %arg, i64 0, i32 2
+ store double %tmp9, double* %tmp10, align 8
+ br i1 undef, label %bb11, label %bb12
+
+bb11: ; preds = %bb
+ br label %bb14
+
+bb12: ; preds = %bb
+ %tmp13 = fmul double undef, %tmp2
+ br label %bb14
+
+bb14: ; preds = %bb12, %bb11
+ ret void
+}
+
+
+%struct.rc4_state.0.24 = type { i32, i32, [256 x i32] }
+
+define void @rc4_crypt(%struct.rc4_state.0.24* nocapture %s) {
+; CHECK-LABEL: @rc4_crypt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_RC4_STATE_0_24:%.*]], %struct.rc4_state.0.24* [[S:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[Y2:%.*]] = getelementptr inbounds [[STRUCT_RC4_STATE_0_24]], %struct.rc4_state.0.24* [[S]], i64 0, i32 1
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[CONV4:%.*]] = and i32 undef, 255
+; CHECK-NEXT: [[CONV7:%.*]] = and i32 undef, 255
+; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[CONV4]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[Y_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[CONV7]], [[FOR_BODY]] ]
+; CHECK-NEXT: store i32 [[X_0_LCSSA]], i32* [[X1]], align 4
+; CHECK-NEXT: store i32 [[Y_0_LCSSA]], i32* [[Y2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %x1 = getelementptr inbounds %struct.rc4_state.0.24, %struct.rc4_state.0.24* %s, i64 0, i32 0
+ %y2 = getelementptr inbounds %struct.rc4_state.0.24, %struct.rc4_state.0.24* %s, i64 0, i32 1
+ br i1 undef, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %x.045 = phi i32 [ %conv4, %for.body ], [ undef, %entry ]
+ %conv4 = and i32 undef, 255
+ %conv7 = and i32 undef, 255
+ %idxprom842 = zext i32 %conv7 to i64
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %x.0.lcssa = phi i32 [ undef, %entry ], [ %conv4, %for.body ]
+ %y.0.lcssa = phi i32 [ undef, %entry ], [ %conv7, %for.body ]
+ store i32 %x.0.lcssa, i32* %x1, align 4
+ store i32 %y.0.lcssa, i32* %y2, align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
new file mode 100644
index 00000000000..a52ec6ba9de
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.DState = type { i32, i32 }
+
+@b = common global %struct.DState zeroinitializer, align 4
+@d = common global i32 0, align 4
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+@e = common global i32 0, align 4
+
+define i32 @fn1() {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 0), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 1), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @d, align 4
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[SW_BB:%.*]], label [[SAVE_STATE_AND_RETURN:%.*]]
+; CHECK: sw.bb:
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* @c, align 4
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP3]], 7
+; CHECK-NEXT: store i32 [[AND]], i32* @a, align 4
+; CHECK-NEXT: switch i32 [[AND]], label [[IF_END:%.*]] [
+; CHECK-NEXT: i32 7, label [[SAVE_STATE_AND_RETURN]]
+; CHECK-NEXT: i32 0, label [[SAVE_STATE_AND_RETURN]]
+; CHECK-NEXT: ]
+; CHECK: if.end:
+; CHECK-NEXT: br label [[SAVE_STATE_AND_RETURN]]
+; CHECK: save_state_and_return:
+; CHECK-NEXT: [[T_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP0]], [[SW_BB]] ], [ [[TMP0]], [[SW_BB]] ]
+; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[TMP1]], [[ENTRY]] ], [ 0, [[SW_BB]] ], [ 0, [[SW_BB]] ]
+; CHECK-NEXT: store i32 [[T_0]], i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 0), align 4
+; CHECK-NEXT: store i32 [[F_0]], i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 1), align 4
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 0), align 4
+ %1 = load i32, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 1), align 4
+ %2 = load i32, i32* @d, align 4
+ %cond = icmp eq i32 %2, 0
+ br i1 %cond, label %sw.bb, label %save_state_and_return
+
+sw.bb: ; preds = %entry
+ %3 = load i32, i32* @c, align 4
+ %and = and i32 %3, 7
+ store i32 %and, i32* @a, align 4
+ switch i32 %and, label %if.end [
+ i32 7, label %save_state_and_return
+ i32 0, label %save_state_and_return
+ ]
+
+if.end: ; preds = %sw.bb
+ br label %save_state_and_return
+
+save_state_and_return: ; preds = %sw.bb, %sw.bb, %if.end, %entry
+ %t.0 = phi i32 [ 0, %if.end ], [ %0, %entry ], [ %0, %sw.bb ], [ %0, %sw.bb ]
+ %f.0 = phi i32 [ 0, %if.end ], [ %1, %entry ], [ 0, %sw.bb ], [ 0, %sw.bb ]
+ store i32 %t.0, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 0), align 4
+ store i32 %f.0, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 1), align 4
+ ret i32 undef
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll
new file mode 100644
index 00000000000..9108e84a3a6
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -disable-verify -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin13.3.0"
+
+define void @_foo(double %p1, double %p2, double %p3) #0 {
+; CHECK-LABEL: @_foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TAB1:%.*]] = alloca [256 x i32], align 16
+; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04
+; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03
+; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[T_0259]], i32 0
+; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]])
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]]
+; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa !0
+; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[P3_ADDR_0258]], i32 0
+; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]])
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]]
+; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa !0
+; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]]
+; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]]
+; CHECK: return:
+; CHECK-NEXT: ret void
+;
+entry:
+ %tab1 = alloca [256 x i32], align 16
+ %tab2 = alloca [256 x i32], align 16
+ br label %bb1
+
+
+bb1:
+ %mul19 = fmul double %p1, 1.638400e+04
+ %mul20 = fmul double %p3, 1.638400e+04
+ %add = fadd double %mul20, 8.192000e+03
+ %mul21 = fmul double %p2, 1.638400e+04
+  ; The SLPVectorizer crashed when scheduling this block after it inserted an
+  ; insertelement instruction (while vectorizing the for.body block) at this position.
+ br label %for.body
+
+for.body:
+ %indvars.iv266 = phi i64 [ 0, %bb1 ], [ %indvars.iv.next267, %for.body ]
+ %t.0259 = phi double [ 0.000000e+00, %bb1 ], [ %add27, %for.body ]
+ %p3.addr.0258 = phi double [ %add, %bb1 ], [ %add28, %for.body ]
+ %vecinit.i.i237 = insertelement <2 x double> undef, double %t.0259, i32 0
+ %x13 = tail call i32 @_xfn(<2 x double> %vecinit.i.i237) #2
+ %arrayidx = getelementptr inbounds [256 x i32], [256 x i32]* %tab1, i64 0, i64 %indvars.iv266
+ store i32 %x13, i32* %arrayidx, align 4, !tbaa !4
+ %vecinit.i.i = insertelement <2 x double> undef, double %p3.addr.0258, i32 0
+ %x14 = tail call i32 @_xfn(<2 x double> %vecinit.i.i) #2
+ %arrayidx26 = getelementptr inbounds [256 x i32], [256 x i32]* %tab2, i64 0, i64 %indvars.iv266
+ store i32 %x14, i32* %arrayidx26, align 4, !tbaa !4
+ %add27 = fadd double %mul19, %t.0259
+ %add28 = fadd double %mul21, %p3.addr.0258
+ %indvars.iv.next267 = add nuw nsw i64 %indvars.iv266, 1
+ %exitcond = icmp eq i64 %indvars.iv.next267, 256
+ br i1 %exitcond, label %return, label %for.body
+
+return:
+ ret void
+}
+
+declare i32 @_xfn(<2 x double>) #4
+
+!3 = !{!"int", !4, i64 0}
+!4 = !{!3, !3, i64 0}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
new file mode 100644
index 00000000000..9a07cfc8431
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll
@@ -0,0 +1,191 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171 = type { i32, i32, i32, i32, i32, i32, [8 x i8] }
+
+define void @SIM4() {
+; CHECK-LABEL: @SIM4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]]
+; CHECK: lor.lhs.false:
+; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: br i1 undef, label [[FOR_END605:%.*]], label [[FOR_BODY_LR_PH:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: br i1 undef, label [[FOR_INC603:%.*]], label [[IF_END12:%.*]]
+; CHECK: if.end12:
+; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE:%.*]], label [[LAND_LHS_TRUE167:%.*]]
+; CHECK: land.lhs.true:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN17:%.*]], label [[LAND_LHS_TRUE167]]
+; CHECK: if.then17:
+; CHECK-NEXT: br i1 undef, label [[IF_END98:%.*]], label [[LAND_RHS_LR_PH:%.*]]
+; CHECK: land.rhs.lr.ph:
+; CHECK-NEXT: unreachable
+; CHECK: if.end98:
+; CHECK-NEXT: [[FROM299:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1
+; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]]
+; CHECK: if.then103:
+; CHECK-NEXT: [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef
+; CHECK-NEXT: [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2
+; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0
+; CHECK-NEXT: [[COND125:%.*]] = select i1 undef, i32 undef, i32 [[MUL114]]
+; CHECK-NEXT: br label [[FOR_COND_I:%.*]]
+; CHECK: for.cond.i:
+; CHECK-NEXT: [[ROW_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874:%.*]] ], [ [[DOTSUB100]], [[IF_THEN103]] ]
+; CHECK-NEXT: [[COL_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874]] ], [ [[COND125]], [[IF_THEN103]] ]
+; CHECK-NEXT: br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]]
+; CHECK: land.rhs.i874:
+; CHECK-NEXT: br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]]
+; CHECK: for.end.i:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]]
+; CHECK: if.then.i:
+; CHECK-NEXT: [[ADD14_I:%.*]] = add nsw i32 [[ROW_0_I]], undef
+; CHECK-NEXT: [[ADD15_I:%.*]] = add nsw i32 [[COL_0_I]], undef
+; CHECK-NEXT: br label [[EXTEND_BW_EXIT:%.*]]
+; CHECK: if.end.i:
+; CHECK-NEXT: [[ADD16_I:%.*]] = add i32 [[COND125]], [[DOTSUB100]]
+; CHECK-NEXT: [[CMP26514_I:%.*]] = icmp slt i32 [[ADD16_I]], 0
+; CHECK-NEXT: br i1 [[CMP26514_I]], label [[FOR_END33_I:%.*]], label [[FOR_BODY28_LR_PH_I:%.*]]
+; CHECK: for.body28.lr.ph.i:
+; CHECK-NEXT: br label [[FOR_END33_I]]
+; CHECK: for.end33.i:
+; CHECK-NEXT: br i1 undef, label [[FOR_END58_I:%.*]], label [[FOR_BODY52_LR_PH_I:%.*]]
+; CHECK: for.body52.lr.ph.i:
+; CHECK-NEXT: br label [[FOR_END58_I]]
+; CHECK: for.end58.i:
+; CHECK-NEXT: br label [[WHILE_COND260_I:%.*]]
+; CHECK: while.cond260.i:
+; CHECK-NEXT: br i1 undef, label [[LAND_RHS263_I:%.*]], label [[WHILE_END275_I:%.*]]
+; CHECK: land.rhs263.i:
+; CHECK-NEXT: br i1 undef, label [[WHILE_COND260_I]], label [[WHILE_END275_I]]
+; CHECK: while.end275.i:
+; CHECK-NEXT: br label [[EXTEND_BW_EXIT]]
+; CHECK: extend_bw.exit:
+; CHECK-NEXT: [[ADD14_I1262:%.*]] = phi i32 [ [[ADD14_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ]
+; CHECK-NEXT: [[ADD15_I1261:%.*]] = phi i32 [ [[ADD15_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ]
+; CHECK-NEXT: br i1 false, label [[IF_THEN157:%.*]], label [[LAND_LHS_TRUE167]]
+; CHECK: if.then157:
+; CHECK-NEXT: [[ADD158:%.*]] = add nsw i32 [[ADD14_I1262]], 1
+; CHECK-NEXT: store i32 [[ADD158]], i32* [[FROM299]], align 4
+; CHECK-NEXT: [[ADD160:%.*]] = add nsw i32 [[ADD15_I1261]], 1
+; CHECK-NEXT: store i32 [[ADD160]], i32* [[FROM1115]], align 4
+; CHECK-NEXT: br label [[LAND_LHS_TRUE167]]
+; CHECK: land.lhs.true167:
+; CHECK-NEXT: unreachable
+; CHECK: for.inc603:
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_END605]]
+; CHECK: for.end605:
+; CHECK-NEXT: unreachable
+; CHECK: return:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %return, label %lor.lhs.false
+
+lor.lhs.false: ; preds = %entry
+ br i1 undef, label %return, label %if.end
+
+if.end: ; preds = %lor.lhs.false
+ br i1 undef, label %for.end605, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %if.end
+ br label %for.body
+
+for.body: ; preds = %for.inc603, %for.body.lr.ph
+ br i1 undef, label %for.inc603, label %if.end12
+
+if.end12: ; preds = %for.body
+ br i1 undef, label %land.lhs.true, label %land.lhs.true167
+
+land.lhs.true: ; preds = %if.end12
+ br i1 undef, label %if.then17, label %land.lhs.true167
+
+if.then17: ; preds = %land.lhs.true
+ br i1 undef, label %if.end98, label %land.rhs.lr.ph
+
+land.rhs.lr.ph: ; preds = %if.then17
+ unreachable
+
+if.end98: ; preds = %if.then17
+ %from299 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171, %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1
+ br i1 undef, label %land.lhs.true167, label %if.then103
+
+if.then103: ; preds = %if.end98
+ %.sub100 = select i1 undef, i32 250, i32 undef
+ %mul114 = shl nsw i32 %.sub100, 2
+ %from1115 = getelementptr inbounds %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171, %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0
+ %cond125 = select i1 undef, i32 undef, i32 %mul114
+ br label %for.cond.i
+
+for.cond.i: ; preds = %land.rhs.i874, %if.then103
+ %row.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %.sub100, %if.then103 ]
+ %col.0.i = phi i32 [ undef, %land.rhs.i874 ], [ %cond125, %if.then103 ]
+ br i1 undef, label %land.rhs.i874, label %for.end.i
+
+land.rhs.i874: ; preds = %for.cond.i
+ br i1 undef, label %for.cond.i, label %for.end.i
+
+for.end.i: ; preds = %land.rhs.i874, %for.cond.i
+ br i1 undef, label %if.then.i, label %if.end.i
+
+if.then.i: ; preds = %for.end.i
+ %add14.i = add nsw i32 %row.0.i, undef
+ %add15.i = add nsw i32 %col.0.i, undef
+ br label %extend_bw.exit
+
+if.end.i: ; preds = %for.end.i
+ %add16.i = add i32 %cond125, %.sub100
+ %cmp26514.i = icmp slt i32 %add16.i, 0
+ br i1 %cmp26514.i, label %for.end33.i, label %for.body28.lr.ph.i
+
+for.body28.lr.ph.i: ; preds = %if.end.i
+ br label %for.end33.i
+
+for.end33.i: ; preds = %for.body28.lr.ph.i, %if.end.i
+ br i1 undef, label %for.end58.i, label %for.body52.lr.ph.i
+
+for.body52.lr.ph.i: ; preds = %for.end33.i
+ br label %for.end58.i
+
+for.end58.i: ; preds = %for.body52.lr.ph.i, %for.end33.i
+ br label %while.cond260.i
+
+while.cond260.i: ; preds = %land.rhs263.i, %for.end58.i
+ br i1 undef, label %land.rhs263.i, label %while.end275.i
+
+land.rhs263.i: ; preds = %while.cond260.i
+ br i1 undef, label %while.cond260.i, label %while.end275.i
+
+while.end275.i: ; preds = %land.rhs263.i, %while.cond260.i
+ br label %extend_bw.exit
+
+extend_bw.exit: ; preds = %while.end275.i, %if.then.i
+ %add14.i1262 = phi i32 [ %add14.i, %if.then.i ], [ undef, %while.end275.i ]
+ %add15.i1261 = phi i32 [ %add15.i, %if.then.i ], [ undef, %while.end275.i ]
+ br i1 false, label %if.then157, label %land.lhs.true167
+
+if.then157: ; preds = %extend_bw.exit
+ %add158 = add nsw i32 %add14.i1262, 1
+ store i32 %add158, i32* %from299, align 4
+ %add160 = add nsw i32 %add15.i1261, 1
+ store i32 %add160, i32* %from1115, align 4
+ br label %land.lhs.true167
+
+land.lhs.true167: ; preds = %if.then157, %extend_bw.exit, %if.end98, %land.lhs.true, %if.end12
+ unreachable
+
+for.inc603: ; preds = %for.body
+ br i1 undef, label %for.body, label %for.end605
+
+for.end605: ; preds = %for.inc603, %if.end
+ unreachable
+
+return: ; preds = %lor.lhs.false, %entry
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
new file mode 100644
index 00000000000..5c753091f95
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.Ray.5.11.53.113.119.137.149.185.329.389.416 = type { %struct.Vec.0.6.48.108.114.132.144.180.324.384.414, %struct.Vec.0.6.48.108.114.132.144.180.324.384.414 }
+%struct.Vec.0.6.48.108.114.132.144.180.324.384.414 = type { double, double, double }
+
+; Function Attrs: ssp uwtable
+define void @main() #0 {
+; CHECK-LABEL: @main(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[COND_TRUE:%.*]], label [[COND_END:%.*]]
+; CHECK: cond.true:
+; CHECK-NEXT: unreachable
+; CHECK: cond.end:
+; CHECK-NEXT: br label [[INVOKE_CONT:%.*]]
+; CHECK: invoke.cont:
+; CHECK-NEXT: br i1 undef, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]]
+; CHECK: arrayctor.cont:
+; CHECK-NEXT: [[AGG_TMP99208_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_113_119_137_149_185_329_389_416:%.*]], %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 0, i32 0
+; CHECK-NEXT: [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_113_119_137_149_185_329_389_416]], %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 1, i32 0
+; CHECK-NEXT: br label [[FOR_COND36_PREHEADER:%.*]]
+; CHECK: for.cond36.preheader:
+; CHECK-NEXT: br i1 undef, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]]
+; CHECK: cond.false51.us:
+; CHECK-NEXT: unreachable
+; CHECK: cond.true48.us:
+; CHECK-NEXT: br i1 undef, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]]
+; CHECK: cond.false66.us:
+; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 0xBFA5CC2D1960285F>
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 1.400000e+02, double 1.400000e+02>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], <double 5.000000e+01, double 5.200000e+01>
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP7]], align 8
+; CHECK-NEXT: unreachable
+; CHECK: cond.true63.us:
+; CHECK-NEXT: unreachable
+; CHECK: for.body42.lr.ph.us:
+; CHECK-NEXT: br i1 undef, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]]
+; CHECK: _Z5clampd.exit.1:
+; CHECK-NEXT: br label [[FOR_COND36_PREHEADER]]
+;
+entry:
+ br i1 undef, label %cond.true, label %cond.end
+
+cond.true: ; preds = %entry
+ unreachable
+
+cond.end: ; preds = %entry
+ br label %invoke.cont
+
+invoke.cont: ; preds = %invoke.cont, %cond.end
+ br i1 undef, label %arrayctor.cont, label %invoke.cont
+
+arrayctor.cont: ; preds = %invoke.cont
+ %agg.tmp99208.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.416, %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 0, i32 0
+ %agg.tmp99208.sroa.1.8.idx388 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.416, %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 0, i32 1
+ %agg.tmp101211.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.416, %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 1, i32 0
+ %agg.tmp101211.sroa.1.8.idx390 = getelementptr inbounds %struct.Ray.5.11.53.113.119.137.149.185.329.389.416, %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 1, i32 1
+ br label %for.cond36.preheader
+
+for.cond36.preheader: ; preds = %_Z5clampd.exit.1, %arrayctor.cont
+ br i1 undef, label %for.body42.lr.ph.us, label %_Z5clampd.exit.1
+
+cond.false51.us: ; preds = %for.body42.lr.ph.us
+ unreachable
+
+cond.true48.us: ; preds = %for.body42.lr.ph.us
+ br i1 undef, label %cond.true63.us, label %cond.false66.us
+
+cond.false66.us: ; preds = %cond.true48.us
+ %add.i276.us = fadd double 0.000000e+00, undef
+ %add.i264.us = fadd double %add.i276.us, 0.000000e+00
+ %add4.i267.us = fadd double undef, 0xBFA5CC2D1960285F
+ %mul.i254.us = fmul double %add.i264.us, 1.400000e+02
+ %mul2.i256.us = fmul double %add4.i267.us, 1.400000e+02
+ %add.i243.us = fadd double %mul.i254.us, 5.000000e+01
+ %add4.i246.us = fadd double %mul2.i256.us, 5.200000e+01
+ %mul.i.i.us = fmul double undef, %add.i264.us
+ %mul2.i.i.us = fmul double undef, %add4.i267.us
+ store double %add.i243.us, double* %agg.tmp99208.sroa.0.0.idx, align 8
+ store double %add4.i246.us, double* %agg.tmp99208.sroa.1.8.idx388, align 8
+ store double %mul.i.i.us, double* %agg.tmp101211.sroa.0.0.idx, align 8
+ store double %mul2.i.i.us, double* %agg.tmp101211.sroa.1.8.idx390, align 8
+ unreachable
+
+cond.true63.us: ; preds = %cond.true48.us
+ unreachable
+
+for.body42.lr.ph.us: ; preds = %for.cond36.preheader
+ br i1 undef, label %cond.true48.us, label %cond.false51.us
+
+_Z5clampd.exit.1: ; preds = %for.cond36.preheader
+ br label %for.cond36.preheader
+}
+
+
+%struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601 = type { %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600, %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 }
+%struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 = type { double, double, double }
+
+define void @_Z8radianceRK3RayiPt() #0 {
+; CHECK-LABEL: @_Z8radianceRK3RayiPt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]]
+; CHECK: if.then38:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double undef, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> undef, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> undef, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> undef, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> undef, [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> undef, [[TMP6]]
+; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; CHECK-NEXT: br label [[RETURN:%.*]]
+; CHECK: if.then78:
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %if.then78, label %if.then38
+
+if.then38: ; preds = %entry
+ %mul.i.i790 = fmul double undef, undef
+ %mul3.i.i792 = fmul double undef, undef
+ %mul.i764 = fmul double undef, %mul3.i.i792
+ %mul4.i767 = fmul double undef, undef
+ %sub.i768 = fsub double %mul.i764, %mul4.i767
+ %mul6.i770 = fmul double undef, %mul.i.i790
+ %mul9.i772 = fmul double undef, %mul3.i.i792
+ %sub10.i773 = fsub double %mul6.i770, %mul9.i772
+ %mul.i736 = fmul double undef, %sub.i768
+ %mul2.i738 = fmul double undef, %sub10.i773
+ %mul.i727 = fmul double undef, %mul.i736
+ %mul2.i729 = fmul double undef, %mul2.i738
+ %add.i716 = fadd double undef, %mul.i727
+ %add4.i719 = fadd double undef, %mul2.i729
+ %add.i695 = fadd double undef, %add.i716
+ %add4.i698 = fadd double undef, %add4.i719
+ %mul.i.i679 = fmul double undef, %add.i695
+ %mul2.i.i680 = fmul double undef, %add4.i698
+ %agg.tmp74663.sroa.0.0.idx = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601, %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0
+ store double %mul.i.i679, double* %agg.tmp74663.sroa.0.0.idx, align 8
+ %agg.tmp74663.sroa.1.8.idx943 = getelementptr inbounds %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601, %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 1
+ store double %mul2.i.i680, double* %agg.tmp74663.sroa.1.8.idx943, align 8
+ br label %return
+
+if.then78: ; preds = %entry
+ br label %return
+
+return: ; preds = %if.then78, %if.then38
+ ret void
+}
+
+attributes #0 = { ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
new file mode 100644
index 00000000000..24f5bad0398
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basicaa -slp-vectorizer -mtriple=x86_64-apple-macosx10.9.0 -mcpu=corei7-avx -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+
+; This test used to crash because we were following phi chains incorrectly.
+; We used indices to get the incoming values of two phi nodes rather than
+; looking them up by incoming block.
+; This can give wrong results when the ordering of the incoming
+; edges in the two phi nodes doesn't match.
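+; A minimal illustration (added for clarity, not from the original test):
+; the same two predecessors can be listed in a different order in each
+; phi of a block, so incoming index 0 names different blocks:
+;   %a = phi double [ %x, %bb1 ], [ %y, %bb2 ]
+;   %b = phi double [ %q, %bb2 ], [ %p, %bb1 ]
+; The incoming value must be looked up by predecessor block
+; (e.g. PHINode::getIncomingValueForBlock), not by index.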
+
+%0 = type { %1, %2 }
+%1 = type { double, double }
+%2 = type { double, double }
+
+
+;define fastcc void @bar() {
+define void @bar() {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[TMP0:%.*]], %0* undef, i64 0, i32 1, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 1
+; CHECK-NEXT: br label [[TMP7:%.*]]
+; CHECK: [[TMP8:%.*]] = phi <2 x double> [ <double 1.800000e+01, double 2.800000e+01>, [[TMP0]] ], [ [[TMP11:%.*]], [[TMP21:%.*]] ], [ [[TMP11]], [[TMP18:%.*]] ], [ [[TMP11]], [[TMP18]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP1]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP3]] to <2 x double>*
+; CHECK-NEXT: [[TMP11]] = load <2 x double>, <2 x double>* [[TMP10]], align 8
+; CHECK-NEXT: br i1 undef, label [[TMP12:%.*]], label [[TMP13:%.*]]
+; CHECK: ret void
+; CHECK: [[TMP14:%.*]] = bitcast double* [[TMP5]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8
+; CHECK-NEXT: br i1 undef, label [[TMP15:%.*]], label [[TMP16:%.*]]
+; CHECK: br label [[TMP16]]
+; CHECK: br i1 undef, label [[TMP17:%.*]], label [[TMP18]]
+; CHECK: unreachable
+; CHECK: [[TMP19:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
+; CHECK-NEXT: switch i32 undef, label [[TMP21]] [
+; CHECK-NEXT: i32 32, label [[TMP7]]
+; CHECK-NEXT: i32 103, label [[TMP7]]
+; CHECK-NEXT: ]
+; CHECK: br i1 undef, label [[TMP7]], label [[TMP22:%.*]]
+; CHECK: unreachable
+;
+ %1 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 0
+ %2 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 1
+ %3 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 0
+ %4 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 1
+ %5 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 0
+ %6 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 1
+ br label %7
+
+; <label>:7 ; preds = %18, %17, %17, %0
+ %8 = phi double [ 2.800000e+01, %0 ], [ %11, %18 ], [ %11, %17 ], [ %11, %17 ]
+ %9 = phi double [ 1.800000e+01, %0 ], [ %10, %18 ], [ %10, %17 ], [ %10, %17 ]
+ store double %9, double* %1, align 8
+ store double %8, double* %2, align 8
+ %10 = load double, double* %3, align 8
+ %11 = load double, double* %4, align 8
+ br i1 undef, label %12, label %13
+
+; <label>:12 ; preds = %7
+ ret void
+
+; <label>:13 ; preds = %7
+ store double %10, double* %5, align 8
+ store double %11, double* %6, align 8
+ br i1 undef, label %14, label %15
+
+; <label>:14 ; preds = %13
+ br label %15
+
+; <label>:15 ; preds = %14, %13
+ br i1 undef, label %16, label %17
+
+; <label>:16 ; preds = %15
+ unreachable
+
+; <label>:17 ; preds = %15
+ switch i32 undef, label %18 [
+ i32 32, label %7
+ i32 103, label %7
+ ]
+
+; <label>:18 ; preds = %17
+ br i1 undef, label %7, label %19
+
+; <label>:19 ; preds = %18
+ unreachable
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll b/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
new file mode 100644
index 00000000000..a3d98e39ce1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; int foo(double *A, float *B, int g) {
+; float B0 = B[0];
+; float B1 = B[1]; <----- BasicBlock #1
+; B0 += 5;
+; B1 += 8;
+;
+; if (g) bar();
+;
+; A[0] += B0; <------- BasicBlock #3
+; A[1] += B1;
+; }
+
+
+define i32 @foo(double* nocapture %A, float* nocapture %B, i32 %g) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <2 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 5.000000e+00, float 8.000000e+00>
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[G:%.*]], 0
+; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar()
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load float, float* %B, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %B, i64 1
+ %1 = load float, float* %arrayidx1, align 4
+ %add = fadd float %0, 5.000000e+00
+ %add2 = fadd float %1, 8.000000e+00
+ %tobool = icmp eq i32 %g, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ %call = tail call i32 (...) @bar()
+ br label %if.end
+
+if.end:
+ %conv = fpext float %add to double
+ %2 = load double, double* %A, align 8
+ %add4 = fadd double %conv, %2
+ store double %add4, double* %A, align 8
+ %conv5 = fpext float %add2 to double
+ %arrayidx6 = getelementptr inbounds double, double* %A, i64 1
+ %3 = load double, double* %arrayidx6, align 8
+ %add7 = fadd double %conv5, %3
+ store double %add7, double* %arrayidx6, align 8
+ ret i32 undef
+}
+
+declare i32 @bar(...)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
new file mode 100644
index 00000000000..d2512dcd615
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll
@@ -0,0 +1,372 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+;int test(double *G) {
+; G[0] = 1+G[5]*4;
+; G[1] = 6+G[6]*3;
+; G[2] = 7+G[5]*4;
+; G[3] = 8+G[6]*4;
+;}
+
+define i32 @test(double* nocapture %G) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[G:%.*]], i64 5
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 4.000000e+00, double 3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+00, double 6.000000e+00>
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %arrayidx = getelementptr inbounds double, double* %G, i64 5
+ %0 = load double, double* %arrayidx, align 8
+ %mul = fmul double %0, 4.000000e+00
+ %add = fadd double %mul, 1.000000e+00
+ store double %add, double* %G, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %G, i64 6
+ %1 = load double, double* %arrayidx2, align 8
+ %mul3 = fmul double %1, 3.000000e+00
+ %add4 = fadd double %mul3, 6.000000e+00
+ %arrayidx5 = getelementptr inbounds double, double* %G, i64 1
+ store double %add4, double* %arrayidx5, align 8
+ %add8 = fadd double %mul, 7.000000e+00
+ %arrayidx9 = getelementptr inbounds double, double* %G, i64 2
+ store double %add8, double* %arrayidx9, align 8
+ %mul11 = fmul double %1, 4.000000e+00
+ %add12 = fadd double %mul11, 8.000000e+00
+ %arrayidx13 = getelementptr inbounds double, double* %G, i64 3
+ store double %add12, double* %arrayidx13, align 8
+ ret i32 undef
+}
+
+;int foo(double *A, int n) {
+; A[0] = A[0] * 7.9 * n + 6.0;
+; A[1] = A[1] * 7.7 * n + 2.0;
+; A[2] = A[2] * 7.6 * n + 3.0;
+; A[3] = A[3] * 7.4 * n + 4.0;
+;}
+
+define i32 @foo(double* nocapture %A, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <4 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 7.900000e+00, double 7.700000e+00, double 7.600000e+00, double 7.400000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> undef, double [[CONV]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CONV]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[CONV]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CONV]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x double> [[TMP7]], <double 6.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[A]] to <4 x double>*
+; CHECK-NEXT: store <4 x double> [[TMP8]], <4 x double>* [[TMP9]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load double, double* %A, align 8
+ %mul = fmul double %0, 7.900000e+00
+ %conv = sitofp i32 %n to double
+ %mul1 = fmul double %conv, %mul
+ %add = fadd double %mul1, 6.000000e+00
+ store double %add, double* %A, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %A, i64 1
+ %1 = load double, double* %arrayidx3, align 8
+ %mul4 = fmul double %1, 7.700000e+00
+ %mul6 = fmul double %conv, %mul4
+ %add7 = fadd double %mul6, 2.000000e+00
+ store double %add7, double* %arrayidx3, align 8
+ %arrayidx9 = getelementptr inbounds double, double* %A, i64 2
+ %2 = load double, double* %arrayidx9, align 8
+ %mul10 = fmul double %2, 7.600000e+00
+ %mul12 = fmul double %conv, %mul10
+ %add13 = fadd double %mul12, 3.000000e+00
+ store double %add13, double* %arrayidx9, align 8
+ %arrayidx15 = getelementptr inbounds double, double* %A, i64 3
+ %3 = load double, double* %arrayidx15, align 8
+ %mul16 = fmul double %3, 7.400000e+00
+ %mul18 = fmul double %conv, %mul16
+ %add19 = fadd double %mul18, 4.000000e+00
+ store double %add19, double* %arrayidx15, align 8
+ ret i32 undef
+}
+
+; int test2(double *G, int k) {
+; if (k) {
+; G[0] = 1+G[5]*4;
+; G[1] = 6+G[6]*3;
+; } else {
+; G[2] = 7+G[5]*4;
+; G[3] = 8+G[6]*3;
+; }
+; }
+
+; We can't merge the gather sequences because one does not dominate the other.
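+; CFG sketch (added for clarity, not from the original test): the two
+; insertelement ("gather") sequences live in mutually exclusive branches,
+; so neither block dominates the other:
+;
+;         %0 (entry)
+;         /        \
+;       %5          %12     <- one gather sequence per branch
+;         \        /
+;          %20 (ret)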
+
+define i32 @test2(double* nocapture %G, i32 %k) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[K:%.*]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[G:%.*]], i64 5
+; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP3]], 4.000000e+00
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP14:%.*]], label [[TMP5:%.*]]
+; CHECK: [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
+; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 6.000000e+00>
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[G]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP13]], align 8
+; CHECK-NEXT: br label [[TMP24:%.*]]
+; CHECK: [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
+; CHECK-NEXT: [[TMP17:%.*]] = load double, double* [[TMP16]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = fmul double [[TMP17]], 3.000000e+00
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP18]], i32 1
+; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], <double 7.000000e+00, double 8.000000e+00>
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[TMP23]], align 8
+; CHECK-NEXT: br label [[TMP24]]
+; CHECK: ret i32 undef
+;
+ %1 = icmp eq i32 %k, 0
+ %2 = getelementptr inbounds double, double* %G, i64 5
+ %3 = load double, double* %2, align 8
+ %4 = fmul double %3, 4.000000e+00
+ br i1 %1, label %12, label %5
+
+; <label>:5 ; preds = %0
+ %6 = fadd double %4, 1.000000e+00
+ store double %6, double* %G, align 8
+ %7 = getelementptr inbounds double, double* %G, i64 6
+ %8 = load double, double* %7, align 8
+ %9 = fmul double %8, 3.000000e+00
+ %10 = fadd double %9, 6.000000e+00
+ %11 = getelementptr inbounds double, double* %G, i64 1
+ store double %10, double* %11, align 8
+ br label %20
+
+; <label>:12 ; preds = %0
+ %13 = fadd double %4, 7.000000e+00
+ %14 = getelementptr inbounds double, double* %G, i64 2
+ store double %13, double* %14, align 8
+ %15 = getelementptr inbounds double, double* %G, i64 6
+ %16 = load double, double* %15, align 8
+ %17 = fmul double %16, 3.000000e+00
+ %18 = fadd double %17, 8.000000e+00
+ %19 = getelementptr inbounds double, double* %G, i64 3
+ store double %18, double* %19, align 8
+ br label %20
+
+; <label>:20 ; preds = %12, %5
+ ret i32 undef
+}
+
+
+;int foo4(double *A, int n) {
+; A[0] = A[0] * 7.9 * n + 6.0;
+; A[1] = A[1] * 7.9 * n + 6.0;
+; A[2] = A[2] * 7.9 * n + 6.0;
+; A[3] = A[3] * 7.9 * n + 6.0;
+;}
+
+define i32 @foo4(double* nocapture %A, i32 %n) {
+; CHECK-LABEL: @foo4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <4 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 7.900000e+00, double 7.900000e+00, double 7.900000e+00, double 7.900000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> undef, double [[CONV]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CONV]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[CONV]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CONV]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x double> [[TMP7]], <double 6.000000e+00, double 6.000000e+00, double 6.000000e+00, double 6.000000e+00>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[A]] to <4 x double>*
+; CHECK-NEXT: store <4 x double> [[TMP8]], <4 x double>* [[TMP9]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load double, double* %A, align 8
+ %mul = fmul double %0, 7.900000e+00
+ %conv = sitofp i32 %n to double
+ %mul1 = fmul double %conv, %mul
+ %add = fadd double %mul1, 6.000000e+00
+ store double %add, double* %A, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %A, i64 1
+ %1 = load double, double* %arrayidx3, align 8
+ %mul4 = fmul double %1, 7.900000e+00
+ %mul6 = fmul double %conv, %mul4
+ %add7 = fadd double %mul6, 6.000000e+00
+ store double %add7, double* %arrayidx3, align 8
+ %arrayidx9 = getelementptr inbounds double, double* %A, i64 2
+ %2 = load double, double* %arrayidx9, align 8
+ %mul10 = fmul double %2, 7.900000e+00
+ %mul12 = fmul double %conv, %mul10
+ %add13 = fadd double %mul12, 6.000000e+00
+ store double %add13, double* %arrayidx9, align 8
+ %arrayidx15 = getelementptr inbounds double, double* %A, i64 3
+ %3 = load double, double* %arrayidx15, align 8
+ %mul16 = fmul double %3, 7.900000e+00
+ %mul18 = fmul double %conv, %mul16
+ %add19 = fadd double %mul18, 6.000000e+00
+ store double %add19, double* %arrayidx15, align 8
+ ret i32 undef
+}
+
+;int partial_mrg(double *A, int n) {
+; A[0] = A[0] * n;
+; A[1] = A[1] * n;
+; if (n < 4) return 0;
+; A[2] = A[2] * n;
+; A[3] = A[3] * (n+4);
+;}
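+; Only A[0]/A[1] are updated unconditionally; A[2]/A[3] sit behind the
+; n < 4 guard. The checks below show two separate <2 x double> trees, one in
+; the entry block and one in if.end, where the (n+4) operand is inserted
+; into lane 1 of the reused splat.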
+
+define i32 @partial_mrg(double* nocapture %A, i32 %n) {
+; CHECK-LABEL: @partial_mrg(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[N]], 4
+; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds double, double* [[A]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[N]], 4
+; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[ADD]] to double
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %0 = load double, double* %A, align 8
+ %conv = sitofp i32 %n to double
+ %mul = fmul double %conv, %0
+ store double %mul, double* %A, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %A, i64 1
+ %1 = load double, double* %arrayidx2, align 8
+ %mul4 = fmul double %conv, %1
+ store double %mul4, double* %arrayidx2, align 8
+ %cmp = icmp slt i32 %n, 4
+ br i1 %cmp, label %return, label %if.end
+
+if.end: ; preds = %entry
+ %arrayidx7 = getelementptr inbounds double, double* %A, i64 2
+ %2 = load double, double* %arrayidx7, align 8
+ %mul9 = fmul double %conv, %2
+ store double %mul9, double* %arrayidx7, align 8
+ %arrayidx11 = getelementptr inbounds double, double* %A, i64 3
+ %3 = load double, double* %arrayidx11, align 8
+ %add = add nsw i32 %n, 4
+ %conv12 = sitofp i32 %add to double
+ %mul13 = fmul double %conv12, %3
+ store double %mul13, double* %arrayidx11, align 8
+ br label %return
+
+return: ; preds = %entry, %if.end
+ ret i32 0
+}
+
+%class.B.53.55 = type { %class.A.52.54, double }
+%class.A.52.54 = type { double, double, double }
+
+@a = external global double, align 8
+
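+; Reduced regression test for PR19646: sw.epilog7 has no predecessors, yet
+; the phis in if.end13 still reference its values, so the vectorizer has to
+; handle the dead block without crashing.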
+define void @PR19646(%class.B.53.55* %this) {
+; CHECK-LABEL: @PR19646(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_END13:%.*]], label [[IF_END13]]
+; CHECK: sw.epilog7:
+; CHECK-NEXT: [[DOTIN:%.*]] = getelementptr inbounds [[CLASS_B_53_55:%.*]], %class.B.53.55* [[THIS:%.*]], i64 0, i32 0, i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[DOTIN]], align 8
+; CHECK-NEXT: [[ADD:%.*]] = fadd double undef, 0.000000e+00
+; CHECK-NEXT: [[ADD6:%.*]] = fadd double [[ADD]], [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load double, double* @a, align 8
+; CHECK-NEXT: [[ADD8:%.*]] = fadd double [[TMP1]], 0.000000e+00
+; CHECK-NEXT: [[_DY:%.*]] = getelementptr inbounds [[CLASS_B_53_55]], %class.B.53.55* [[THIS]], i64 0, i32 0, i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[_DY]], align 8
+; CHECK-NEXT: [[ADD10:%.*]] = fadd double [[ADD8]], [[TMP2]]
+; CHECK-NEXT: br i1 undef, label [[IF_THEN12:%.*]], label [[IF_END13]]
+; CHECK: if.then12:
+; CHECK-NEXT: [[TMP3:%.*]] = load double, double* undef, align 8
+; CHECK-NEXT: br label [[IF_END13]]
+; CHECK: if.end13:
+; CHECK-NEXT: [[X_1:%.*]] = phi double [ 0.000000e+00, [[IF_THEN12]] ], [ [[ADD6]], [[SW_EPILOG7:%.*]] ], [ undef, [[ENTRY:%.*]] ], [ undef, [[ENTRY]] ]
+; CHECK-NEXT: [[B_0:%.*]] = phi double [ [[TMP3]], [[IF_THEN12]] ], [ [[ADD10]], [[SW_EPILOG7]] ], [ undef, [[ENTRY]] ], [ undef, [[ENTRY]] ]
+; CHECK-NEXT: unreachable
+;
+entry:
+ br i1 undef, label %if.end13, label %if.end13
+
+sw.epilog7: ; No predecessors!
+ %.in = getelementptr inbounds %class.B.53.55, %class.B.53.55* %this, i64 0, i32 0, i32 1
+ %0 = load double, double* %.in, align 8
+ %add = fadd double undef, 0.000000e+00
+ %add6 = fadd double %add, %0
+ %1 = load double, double* @a, align 8
+ %add8 = fadd double %1, 0.000000e+00
+ %_dy = getelementptr inbounds %class.B.53.55, %class.B.53.55* %this, i64 0, i32 0, i32 2
+ %2 = load double, double* %_dy, align 8
+ %add10 = fadd double %add8, %2
+ br i1 undef, label %if.then12, label %if.end13
+
+if.then12: ; preds = %sw.epilog7
+ %3 = load double, double* undef, align 8
+ br label %if.end13
+
+if.end13: ; preds = %if.then12, %sw.epilog7, %entry
+ %x.1 = phi double [ 0.000000e+00, %if.then12 ], [ %add6, %sw.epilog7 ], [ undef, %entry ], [ undef, %entry ]
+ %b.0 = phi double [ %3, %if.then12 ], [ %add10, %sw.epilog7 ], [ undef, %entry ], [ undef, %entry ]
+ unreachable
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
new file mode 100644
index 00000000000..cd5a358fc72
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll
@@ -0,0 +1,905 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [4 x i64] zeroinitializer, align 32
+@dst64 = common global [4 x i64] zeroinitializer, align 32
+@src32 = common global [8 x i32] zeroinitializer, align 32
+@dst32 = common global [8 x i32] zeroinitializer, align 32
+@src16 = common global [16 x i16] zeroinitializer, align 32
+@dst16 = common global [16 x i16] zeroinitializer, align 32
+@src8 = common global [32 x i8] zeroinitializer, align 32
+@dst8 = common global [32 x i8] zeroinitializer, align 32
+
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i8 @llvm.ctlz.i8(i8, i1)
+
+;
+; CTLZ
+;
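+; The second i1 operand of llvm.ctlz selects the zero-input behaviour:
+; false (written "i1 0" in the scalar bodies below) requests a fully
+; defined result for a zero input. Note how the check prefixes diverge:
+; e.g. only AVX2 turns ctlz_8i32 into a single <8 x i32> intrinsic call.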
+
+define void @ctlz_2i64() #0 {
+; CHECK-LABEL: @ctlz_2i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
+; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
+; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+ %ctlz0 = call i64 @llvm.ctlz.i64(i64 %ld0, i1 0)
+ %ctlz1 = call i64 @llvm.ctlz.i64(i64 %ld1, i1 0)
+ store i64 %ctlz0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %ctlz1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @ctlz_4i64() #0 {
+; CHECK-LABEL: @ctlz_4i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
+; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
+; CHECK-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
+; CHECK-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
+; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; CHECK-NEXT: store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; CHECK-NEXT: store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+ %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+ %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+ %ctlz0 = call i64 @llvm.ctlz.i64(i64 %ld0, i1 0)
+ %ctlz1 = call i64 @llvm.ctlz.i64(i64 %ld1, i1 0)
+ %ctlz2 = call i64 @llvm.ctlz.i64(i64 %ld2, i1 0)
+ %ctlz3 = call i64 @llvm.ctlz.i64(i64 %ld3, i1 0)
+ store i64 %ctlz0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+ store i64 %ctlz1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+ store i64 %ctlz2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+ store i64 %ctlz3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+ ret void
+}
+
+define void @ctlz_4i32() #0 {
+; CHECK-LABEL: @ctlz_4i32(
+; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+ %ctlz0 = call i32 @llvm.ctlz.i32(i32 %ld0, i1 0)
+ %ctlz1 = call i32 @llvm.ctlz.i32(i32 %ld1, i1 0)
+ %ctlz2 = call i32 @llvm.ctlz.i32(i32 %ld2, i1 0)
+ %ctlz3 = call i32 @llvm.ctlz.i32(i32 %ld3, i1 0)
+ store i32 %ctlz0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %ctlz1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %ctlz2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %ctlz3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @ctlz_8i32() #0 {
+; SSE-LABEL: @ctlz_8i32(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
+; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
+; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
+; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
+; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ctlz_8i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
+; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
+; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
+; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
+; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ctlz_8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX2-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+ %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+ %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+ %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+ %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+ %ctlz0 = call i32 @llvm.ctlz.i32(i32 %ld0, i1 0)
+ %ctlz1 = call i32 @llvm.ctlz.i32(i32 %ld1, i1 0)
+ %ctlz2 = call i32 @llvm.ctlz.i32(i32 %ld2, i1 0)
+ %ctlz3 = call i32 @llvm.ctlz.i32(i32 %ld3, i1 0)
+ %ctlz4 = call i32 @llvm.ctlz.i32(i32 %ld4, i1 0)
+ %ctlz5 = call i32 @llvm.ctlz.i32(i32 %ld5, i1 0)
+ %ctlz6 = call i32 @llvm.ctlz.i32(i32 %ld6, i1 0)
+ %ctlz7 = call i32 @llvm.ctlz.i32(i32 %ld7, i1 0)
+ store i32 %ctlz0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+ store i32 %ctlz1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+ store i32 %ctlz2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+ store i32 %ctlz3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+ store i32 %ctlz4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+ store i32 %ctlz5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+ store i32 %ctlz6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+ store i32 %ctlz7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+ ret void
+}
+
+define void @ctlz_8i16() #0 {
+; CHECK-LABEL: @ctlz_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false)
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ctlz0 = call i16 @llvm.ctlz.i16(i16 %ld0, i1 0)
+ %ctlz1 = call i16 @llvm.ctlz.i16(i16 %ld1, i1 0)
+ %ctlz2 = call i16 @llvm.ctlz.i16(i16 %ld2, i1 0)
+ %ctlz3 = call i16 @llvm.ctlz.i16(i16 %ld3, i1 0)
+ %ctlz4 = call i16 @llvm.ctlz.i16(i16 %ld4, i1 0)
+ %ctlz5 = call i16 @llvm.ctlz.i16(i16 %ld5, i1 0)
+ %ctlz6 = call i16 @llvm.ctlz.i16(i16 %ld6, i1 0)
+ %ctlz7 = call i16 @llvm.ctlz.i16(i16 %ld7, i1 0)
+ store i16 %ctlz0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %ctlz1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %ctlz2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %ctlz3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %ctlz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %ctlz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %ctlz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %ctlz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ ret void
+}
+
+define void @ctlz_16i16() #0 {
+; SSE-LABEL: @ctlz_16i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false)
+; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 false)
+; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @ctlz_16i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 false)
+; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
+ %ld9 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
+ %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
+ %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
+ %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
+ %ctlz0 = call i16 @llvm.ctlz.i16(i16 %ld0, i1 0)
+ %ctlz1 = call i16 @llvm.ctlz.i16(i16 %ld1, i1 0)
+ %ctlz2 = call i16 @llvm.ctlz.i16(i16 %ld2, i1 0)
+ %ctlz3 = call i16 @llvm.ctlz.i16(i16 %ld3, i1 0)
+ %ctlz4 = call i16 @llvm.ctlz.i16(i16 %ld4, i1 0)
+ %ctlz5 = call i16 @llvm.ctlz.i16(i16 %ld5, i1 0)
+ %ctlz6 = call i16 @llvm.ctlz.i16(i16 %ld6, i1 0)
+ %ctlz7 = call i16 @llvm.ctlz.i16(i16 %ld7, i1 0)
+ %ctlz8 = call i16 @llvm.ctlz.i16(i16 %ld8, i1 0)
+ %ctlz9 = call i16 @llvm.ctlz.i16(i16 %ld9, i1 0)
+ %ctlz10 = call i16 @llvm.ctlz.i16(i16 %ld10, i1 0)
+ %ctlz11 = call i16 @llvm.ctlz.i16(i16 %ld11, i1 0)
+ %ctlz12 = call i16 @llvm.ctlz.i16(i16 %ld12, i1 0)
+ %ctlz13 = call i16 @llvm.ctlz.i16(i16 %ld13, i1 0)
+ %ctlz14 = call i16 @llvm.ctlz.i16(i16 %ld14, i1 0)
+ %ctlz15 = call i16 @llvm.ctlz.i16(i16 %ld15, i1 0)
+ store i16 %ctlz0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %ctlz1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %ctlz2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %ctlz3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %ctlz4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %ctlz5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %ctlz6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %ctlz7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ store i16 %ctlz8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
+ store i16 %ctlz9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
+ store i16 %ctlz10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
+ store i16 %ctlz11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
+ store i16 %ctlz12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
+ store i16 %ctlz13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
+ store i16 %ctlz14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
+ store i16 %ctlz15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
+ ret void
+}
+
+define void @ctlz_16i8() #0 {
+; CHECK-LABEL: @ctlz_16i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false)
+; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ctlz0 = call i8 @llvm.ctlz.i8(i8 %ld0, i1 0)
+ %ctlz1 = call i8 @llvm.ctlz.i8(i8 %ld1, i1 0)
+ %ctlz2 = call i8 @llvm.ctlz.i8(i8 %ld2, i1 0)
+ %ctlz3 = call i8 @llvm.ctlz.i8(i8 %ld3, i1 0)
+ %ctlz4 = call i8 @llvm.ctlz.i8(i8 %ld4, i1 0)
+ %ctlz5 = call i8 @llvm.ctlz.i8(i8 %ld5, i1 0)
+ %ctlz6 = call i8 @llvm.ctlz.i8(i8 %ld6, i1 0)
+ %ctlz7 = call i8 @llvm.ctlz.i8(i8 %ld7, i1 0)
+ %ctlz8 = call i8 @llvm.ctlz.i8(i8 %ld8, i1 0)
+ %ctlz9 = call i8 @llvm.ctlz.i8(i8 %ld9, i1 0)
+ %ctlz10 = call i8 @llvm.ctlz.i8(i8 %ld10, i1 0)
+ %ctlz11 = call i8 @llvm.ctlz.i8(i8 %ld11, i1 0)
+ %ctlz12 = call i8 @llvm.ctlz.i8(i8 %ld12, i1 0)
+ %ctlz13 = call i8 @llvm.ctlz.i8(i8 %ld13, i1 0)
+ %ctlz14 = call i8 @llvm.ctlz.i8(i8 %ld14, i1 0)
+ %ctlz15 = call i8 @llvm.ctlz.i8(i8 %ld15, i1 0)
+ store i8 %ctlz0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %ctlz1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %ctlz2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %ctlz3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %ctlz4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %ctlz5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %ctlz6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %ctlz7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %ctlz8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %ctlz9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %ctlz10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %ctlz11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %ctlz12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %ctlz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %ctlz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %ctlz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ ret void
+}
+
+define void @ctlz_32i8() #0 {
+; CHECK-LABEL: @ctlz_32i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false)
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 false)
+; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ld16 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
+ %ld17 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
+ %ld18 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
+ %ld19 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
+ %ld20 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
+ %ld21 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
+ %ld22 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
+ %ld23 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
+ %ld24 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
+ %ld25 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
+ %ld26 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
+ %ld27 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
+ %ld28 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
+ %ld29 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
+ %ld30 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
+ %ld31 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
+ %ctlz0 = call i8 @llvm.ctlz.i8(i8 %ld0, i1 0)
+ %ctlz1 = call i8 @llvm.ctlz.i8(i8 %ld1, i1 0)
+ %ctlz2 = call i8 @llvm.ctlz.i8(i8 %ld2, i1 0)
+ %ctlz3 = call i8 @llvm.ctlz.i8(i8 %ld3, i1 0)
+ %ctlz4 = call i8 @llvm.ctlz.i8(i8 %ld4, i1 0)
+ %ctlz5 = call i8 @llvm.ctlz.i8(i8 %ld5, i1 0)
+ %ctlz6 = call i8 @llvm.ctlz.i8(i8 %ld6, i1 0)
+ %ctlz7 = call i8 @llvm.ctlz.i8(i8 %ld7, i1 0)
+ %ctlz8 = call i8 @llvm.ctlz.i8(i8 %ld8, i1 0)
+ %ctlz9 = call i8 @llvm.ctlz.i8(i8 %ld9, i1 0)
+ %ctlz10 = call i8 @llvm.ctlz.i8(i8 %ld10, i1 0)
+ %ctlz11 = call i8 @llvm.ctlz.i8(i8 %ld11, i1 0)
+ %ctlz12 = call i8 @llvm.ctlz.i8(i8 %ld12, i1 0)
+ %ctlz13 = call i8 @llvm.ctlz.i8(i8 %ld13, i1 0)
+ %ctlz14 = call i8 @llvm.ctlz.i8(i8 %ld14, i1 0)
+ %ctlz15 = call i8 @llvm.ctlz.i8(i8 %ld15, i1 0)
+ %ctlz16 = call i8 @llvm.ctlz.i8(i8 %ld16, i1 0)
+ %ctlz17 = call i8 @llvm.ctlz.i8(i8 %ld17, i1 0)
+ %ctlz18 = call i8 @llvm.ctlz.i8(i8 %ld18, i1 0)
+ %ctlz19 = call i8 @llvm.ctlz.i8(i8 %ld19, i1 0)
+ %ctlz20 = call i8 @llvm.ctlz.i8(i8 %ld20, i1 0)
+ %ctlz21 = call i8 @llvm.ctlz.i8(i8 %ld21, i1 0)
+ %ctlz22 = call i8 @llvm.ctlz.i8(i8 %ld22, i1 0)
+ %ctlz23 = call i8 @llvm.ctlz.i8(i8 %ld23, i1 0)
+ %ctlz24 = call i8 @llvm.ctlz.i8(i8 %ld24, i1 0)
+ %ctlz25 = call i8 @llvm.ctlz.i8(i8 %ld25, i1 0)
+ %ctlz26 = call i8 @llvm.ctlz.i8(i8 %ld26, i1 0)
+ %ctlz27 = call i8 @llvm.ctlz.i8(i8 %ld27, i1 0)
+ %ctlz28 = call i8 @llvm.ctlz.i8(i8 %ld28, i1 0)
+ %ctlz29 = call i8 @llvm.ctlz.i8(i8 %ld29, i1 0)
+ %ctlz30 = call i8 @llvm.ctlz.i8(i8 %ld30, i1 0)
+ %ctlz31 = call i8 @llvm.ctlz.i8(i8 %ld31, i1 0)
+ store i8 %ctlz0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %ctlz1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %ctlz2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %ctlz3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %ctlz4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %ctlz5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %ctlz6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %ctlz7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %ctlz8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %ctlz9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %ctlz10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %ctlz11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %ctlz12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %ctlz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %ctlz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %ctlz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ store i8 %ctlz16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
+ store i8 %ctlz17, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
+ store i8 %ctlz18, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
+ store i8 %ctlz19, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
+ store i8 %ctlz20, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
+ store i8 %ctlz21, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
+ store i8 %ctlz22, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
+ store i8 %ctlz23, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
+ store i8 %ctlz24, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
+ store i8 %ctlz25, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
+ store i8 %ctlz26, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
+ store i8 %ctlz27, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
+ store i8 %ctlz28, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
+ store i8 %ctlz29, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
+ store i8 %ctlz30, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
+ store i8 %ctlz31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
+ ret void
+}
+
+;
+; CTLZ_ZERO_UNDEF
+;
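+; These variants pass "i1 -1" (i.e. true), so the result is undefined when
+; the input is zero. The expected vectorization mirrors the defined-zero
+; tests above, per the CHECK/SSE/AVX lines.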
+
+define void @ctlz_undef_2i64() #0 {
+; CHECK-LABEL: @ctlz_undef_2i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
+; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
+; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+ %ctlz0 = call i64 @llvm.ctlz.i64(i64 %ld0, i1 -1)
+ %ctlz1 = call i64 @llvm.ctlz.i64(i64 %ld1, i1 -1)
+ store i64 %ctlz0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %ctlz1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @ctlz_undef_4i64() #0 {
+; CHECK-LABEL: @ctlz_undef_4i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; CHECK-NEXT: [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
+; CHECK-NEXT: [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
+; CHECK-NEXT: [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
+; CHECK-NEXT: [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
+; CHECK-NEXT: store i64 [[CTLZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; CHECK-NEXT: store i64 [[CTLZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; CHECK-NEXT: store i64 [[CTLZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; CHECK-NEXT: store i64 [[CTLZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+ %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+ %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+ %ctlz0 = call i64 @llvm.ctlz.i64(i64 %ld0, i1 -1)
+ %ctlz1 = call i64 @llvm.ctlz.i64(i64 %ld1, i1 -1)
+ %ctlz2 = call i64 @llvm.ctlz.i64(i64 %ld2, i1 -1)
+ %ctlz3 = call i64 @llvm.ctlz.i64(i64 %ld3, i1 -1)
+ store i64 %ctlz0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+ store i64 %ctlz1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+ store i64 %ctlz2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+ store i64 %ctlz3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+ ret void
+}
+
+define void @ctlz_undef_4i32() #0 {
+; CHECK-LABEL: @ctlz_undef_4i32(
+; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
+; CHECK-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
+; CHECK-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
+; CHECK-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
+; CHECK-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+ %ctlz0 = call i32 @llvm.ctlz.i32(i32 %ld0, i1 -1)
+ %ctlz1 = call i32 @llvm.ctlz.i32(i32 %ld1, i1 -1)
+ %ctlz2 = call i32 @llvm.ctlz.i32(i32 %ld2, i1 -1)
+ %ctlz3 = call i32 @llvm.ctlz.i32(i32 %ld3, i1 -1)
+ store i32 %ctlz0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %ctlz1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %ctlz2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %ctlz3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @ctlz_undef_8i32() #0 {
+; SSE-LABEL: @ctlz_undef_8i32(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
+; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
+; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
+; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
+; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true)
+; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true)
+; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true)
+; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true)
+; SSE-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; SSE-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; SSE-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; SSE-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; SSE-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; SSE-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; SSE-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; SSE-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ctlz_undef_8i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
+; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
+; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
+; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
+; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true)
+; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true)
+; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true)
+; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true)
+; AVX1-NEXT: store i32 [[CTLZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; AVX1-NEXT: store i32 [[CTLZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; AVX1-NEXT: store i32 [[CTLZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; AVX1-NEXT: store i32 [[CTLZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; AVX1-NEXT: store i32 [[CTLZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; AVX1-NEXT: store i32 [[CTLZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; AVX1-NEXT: store i32 [[CTLZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; AVX1-NEXT: store i32 [[CTLZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ctlz_undef_8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true)
+; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX2-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+ %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+ %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+ %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+ %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+ %ctlz0 = call i32 @llvm.ctlz.i32(i32 %ld0, i1 -1)
+ %ctlz1 = call i32 @llvm.ctlz.i32(i32 %ld1, i1 -1)
+ %ctlz2 = call i32 @llvm.ctlz.i32(i32 %ld2, i1 -1)
+ %ctlz3 = call i32 @llvm.ctlz.i32(i32 %ld3, i1 -1)
+ %ctlz4 = call i32 @llvm.ctlz.i32(i32 %ld4, i1 -1)
+ %ctlz5 = call i32 @llvm.ctlz.i32(i32 %ld5, i1 -1)
+ %ctlz6 = call i32 @llvm.ctlz.i32(i32 %ld6, i1 -1)
+ %ctlz7 = call i32 @llvm.ctlz.i32(i32 %ld7, i1 -1)
+ store i32 %ctlz0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+ store i32 %ctlz1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+ store i32 %ctlz2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+ store i32 %ctlz3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+ store i32 %ctlz4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+ store i32 %ctlz5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+ store i32 %ctlz6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+ store i32 %ctlz7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+ ret void
+}
+
+define void @ctlz_undef_8i16() #0 {
+; CHECK-LABEL: @ctlz_undef_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true)
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ctlz0 = call i16 @llvm.ctlz.i16(i16 %ld0, i1 -1)
+ %ctlz1 = call i16 @llvm.ctlz.i16(i16 %ld1, i1 -1)
+ %ctlz2 = call i16 @llvm.ctlz.i16(i16 %ld2, i1 -1)
+ %ctlz3 = call i16 @llvm.ctlz.i16(i16 %ld3, i1 -1)
+ %ctlz4 = call i16 @llvm.ctlz.i16(i16 %ld4, i1 -1)
+ %ctlz5 = call i16 @llvm.ctlz.i16(i16 %ld5, i1 -1)
+ %ctlz6 = call i16 @llvm.ctlz.i16(i16 %ld6, i1 -1)
+ %ctlz7 = call i16 @llvm.ctlz.i16(i16 %ld7, i1 -1)
+ store i16 %ctlz0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %ctlz1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %ctlz2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %ctlz3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %ctlz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %ctlz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %ctlz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %ctlz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ ret void
+}
+
+define void @ctlz_undef_16i16() #0 {
+; SSE-LABEL: @ctlz_undef_16i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true)
+; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 true)
+; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @ctlz_undef_16i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 true)
+; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
+ %ld9 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
+ %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
+ %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
+ %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
+ %ctlz0 = call i16 @llvm.ctlz.i16(i16 %ld0, i1 -1)
+ %ctlz1 = call i16 @llvm.ctlz.i16(i16 %ld1, i1 -1)
+ %ctlz2 = call i16 @llvm.ctlz.i16(i16 %ld2, i1 -1)
+ %ctlz3 = call i16 @llvm.ctlz.i16(i16 %ld3, i1 -1)
+ %ctlz4 = call i16 @llvm.ctlz.i16(i16 %ld4, i1 -1)
+ %ctlz5 = call i16 @llvm.ctlz.i16(i16 %ld5, i1 -1)
+ %ctlz6 = call i16 @llvm.ctlz.i16(i16 %ld6, i1 -1)
+ %ctlz7 = call i16 @llvm.ctlz.i16(i16 %ld7, i1 -1)
+ %ctlz8 = call i16 @llvm.ctlz.i16(i16 %ld8, i1 -1)
+ %ctlz9 = call i16 @llvm.ctlz.i16(i16 %ld9, i1 -1)
+ %ctlz10 = call i16 @llvm.ctlz.i16(i16 %ld10, i1 -1)
+ %ctlz11 = call i16 @llvm.ctlz.i16(i16 %ld11, i1 -1)
+ %ctlz12 = call i16 @llvm.ctlz.i16(i16 %ld12, i1 -1)
+ %ctlz13 = call i16 @llvm.ctlz.i16(i16 %ld13, i1 -1)
+ %ctlz14 = call i16 @llvm.ctlz.i16(i16 %ld14, i1 -1)
+ %ctlz15 = call i16 @llvm.ctlz.i16(i16 %ld15, i1 -1)
+ store i16 %ctlz0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %ctlz1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %ctlz2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %ctlz3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %ctlz4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %ctlz5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %ctlz6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %ctlz7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ store i16 %ctlz8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
+ store i16 %ctlz9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
+ store i16 %ctlz10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
+ store i16 %ctlz11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
+ store i16 %ctlz12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
+ store i16 %ctlz13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
+ store i16 %ctlz14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
+ store i16 %ctlz15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
+ ret void
+}
+
+define void @ctlz_undef_16i8() #0 {
+; CHECK-LABEL: @ctlz_undef_16i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true)
+; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ctlz0 = call i8 @llvm.ctlz.i8(i8 %ld0, i1 -1)
+ %ctlz1 = call i8 @llvm.ctlz.i8(i8 %ld1, i1 -1)
+ %ctlz2 = call i8 @llvm.ctlz.i8(i8 %ld2, i1 -1)
+ %ctlz3 = call i8 @llvm.ctlz.i8(i8 %ld3, i1 -1)
+ %ctlz4 = call i8 @llvm.ctlz.i8(i8 %ld4, i1 -1)
+ %ctlz5 = call i8 @llvm.ctlz.i8(i8 %ld5, i1 -1)
+ %ctlz6 = call i8 @llvm.ctlz.i8(i8 %ld6, i1 -1)
+ %ctlz7 = call i8 @llvm.ctlz.i8(i8 %ld7, i1 -1)
+ %ctlz8 = call i8 @llvm.ctlz.i8(i8 %ld8, i1 -1)
+ %ctlz9 = call i8 @llvm.ctlz.i8(i8 %ld9, i1 -1)
+ %ctlz10 = call i8 @llvm.ctlz.i8(i8 %ld10, i1 -1)
+ %ctlz11 = call i8 @llvm.ctlz.i8(i8 %ld11, i1 -1)
+ %ctlz12 = call i8 @llvm.ctlz.i8(i8 %ld12, i1 -1)
+ %ctlz13 = call i8 @llvm.ctlz.i8(i8 %ld13, i1 -1)
+ %ctlz14 = call i8 @llvm.ctlz.i8(i8 %ld14, i1 -1)
+ %ctlz15 = call i8 @llvm.ctlz.i8(i8 %ld15, i1 -1)
+ store i8 %ctlz0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %ctlz1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %ctlz2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %ctlz3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %ctlz4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %ctlz5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %ctlz6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %ctlz7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %ctlz8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %ctlz9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %ctlz10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %ctlz11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %ctlz12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %ctlz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %ctlz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %ctlz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ ret void
+}
+
+define void @ctlz_undef_32i8() #0 {
+; CHECK-LABEL: @ctlz_undef_32i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 true)
+; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ld16 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
+ %ld17 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
+ %ld18 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
+ %ld19 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
+ %ld20 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
+ %ld21 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
+ %ld22 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
+ %ld23 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
+ %ld24 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
+ %ld25 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
+ %ld26 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
+ %ld27 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
+ %ld28 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
+ %ld29 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
+ %ld30 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
+ %ld31 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
+ %ctlz0 = call i8 @llvm.ctlz.i8(i8 %ld0, i1 -1)
+ %ctlz1 = call i8 @llvm.ctlz.i8(i8 %ld1, i1 -1)
+ %ctlz2 = call i8 @llvm.ctlz.i8(i8 %ld2, i1 -1)
+ %ctlz3 = call i8 @llvm.ctlz.i8(i8 %ld3, i1 -1)
+ %ctlz4 = call i8 @llvm.ctlz.i8(i8 %ld4, i1 -1)
+ %ctlz5 = call i8 @llvm.ctlz.i8(i8 %ld5, i1 -1)
+ %ctlz6 = call i8 @llvm.ctlz.i8(i8 %ld6, i1 -1)
+ %ctlz7 = call i8 @llvm.ctlz.i8(i8 %ld7, i1 -1)
+ %ctlz8 = call i8 @llvm.ctlz.i8(i8 %ld8, i1 -1)
+ %ctlz9 = call i8 @llvm.ctlz.i8(i8 %ld9, i1 -1)
+ %ctlz10 = call i8 @llvm.ctlz.i8(i8 %ld10, i1 -1)
+ %ctlz11 = call i8 @llvm.ctlz.i8(i8 %ld11, i1 -1)
+ %ctlz12 = call i8 @llvm.ctlz.i8(i8 %ld12, i1 -1)
+ %ctlz13 = call i8 @llvm.ctlz.i8(i8 %ld13, i1 -1)
+ %ctlz14 = call i8 @llvm.ctlz.i8(i8 %ld14, i1 -1)
+ %ctlz15 = call i8 @llvm.ctlz.i8(i8 %ld15, i1 -1)
+ %ctlz16 = call i8 @llvm.ctlz.i8(i8 %ld16, i1 -1)
+ %ctlz17 = call i8 @llvm.ctlz.i8(i8 %ld17, i1 -1)
+ %ctlz18 = call i8 @llvm.ctlz.i8(i8 %ld18, i1 -1)
+ %ctlz19 = call i8 @llvm.ctlz.i8(i8 %ld19, i1 -1)
+ %ctlz20 = call i8 @llvm.ctlz.i8(i8 %ld20, i1 -1)
+ %ctlz21 = call i8 @llvm.ctlz.i8(i8 %ld21, i1 -1)
+ %ctlz22 = call i8 @llvm.ctlz.i8(i8 %ld22, i1 -1)
+ %ctlz23 = call i8 @llvm.ctlz.i8(i8 %ld23, i1 -1)
+ %ctlz24 = call i8 @llvm.ctlz.i8(i8 %ld24, i1 -1)
+ %ctlz25 = call i8 @llvm.ctlz.i8(i8 %ld25, i1 -1)
+ %ctlz26 = call i8 @llvm.ctlz.i8(i8 %ld26, i1 -1)
+ %ctlz27 = call i8 @llvm.ctlz.i8(i8 %ld27, i1 -1)
+ %ctlz28 = call i8 @llvm.ctlz.i8(i8 %ld28, i1 -1)
+ %ctlz29 = call i8 @llvm.ctlz.i8(i8 %ld29, i1 -1)
+ %ctlz30 = call i8 @llvm.ctlz.i8(i8 %ld30, i1 -1)
+ %ctlz31 = call i8 @llvm.ctlz.i8(i8 %ld31, i1 -1)
+ store i8 %ctlz0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %ctlz1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %ctlz2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %ctlz3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %ctlz4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %ctlz5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %ctlz6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %ctlz7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %ctlz8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %ctlz9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %ctlz10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %ctlz11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %ctlz12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %ctlz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %ctlz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %ctlz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ store i8 %ctlz16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
+ store i8 %ctlz17, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
+ store i8 %ctlz18, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
+ store i8 %ctlz19, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
+ store i8 %ctlz20, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
+ store i8 %ctlz21, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
+ store i8 %ctlz22, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
+ store i8 %ctlz23, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
+ store i8 %ctlz24, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
+ store i8 %ctlz25, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
+ store i8 %ctlz26, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
+ store i8 %ctlz27, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
+ store i8 %ctlz28, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
+ store i8 %ctlz29, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
+ store i8 %ctlz30, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
+ store i8 %ctlz31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
new file mode 100644
index 00000000000..42f19c8ddc8
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll
@@ -0,0 +1,512 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+sse2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+sse4.2,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+avx,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mattr=+avx2,+popcnt -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [4 x i64] zeroinitializer, align 32
+@dst64 = common global [4 x i64] zeroinitializer, align 32
+@src32 = common global [8 x i32] zeroinitializer, align 32
+@dst32 = common global [8 x i32] zeroinitializer, align 32
+@src16 = common global [16 x i16] zeroinitializer, align 32
+@dst16 = common global [16 x i16] zeroinitializer, align 32
+@src8 = common global [32 x i8] zeroinitializer, align 32
+@dst8 = common global [32 x i8] zeroinitializer, align 32
+
+declare i64 @llvm.ctpop.i64(i64)
+declare i32 @llvm.ctpop.i32(i32)
+declare i16 @llvm.ctpop.i16(i16)
+declare i8 @llvm.ctpop.i8(i8)
+
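+;
+; CTPOP
+;
+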
+define void @ctpop_2i64() #0 {
+; CHECK-LABEL: @ctpop_2i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; CHECK-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; CHECK-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+ %ctpop0 = call i64 @llvm.ctpop.i64(i64 %ld0)
+ %ctpop1 = call i64 @llvm.ctpop.i64(i64 %ld1)
+ store i64 %ctpop0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %ctpop1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
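+; The SSE and AVX1 cost models keep i64 ctpop scalar; only AVX2 vectorizes to <4 x i64>.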
+define void @ctpop_4i64() #0 {
+; SSE-LABEL: @ctpop_4i64(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; SSE-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; SSE-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; SSE-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
+; SSE-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
+; SSE-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; SSE-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; SSE-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; SSE-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ctpop_4i64(
+; AVX1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; AVX1-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; AVX1-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; AVX1-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; AVX1-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
+; AVX1-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
+; AVX1-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
+; AVX1-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
+; AVX1-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; AVX1-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; AVX1-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; AVX1-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ctpop_4i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> [[TMP1]])
+; AVX2-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX2-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+ %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+ %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+ %ctpop0 = call i64 @llvm.ctpop.i64(i64 %ld0)
+ %ctpop1 = call i64 @llvm.ctpop.i64(i64 %ld1)
+ %ctpop2 = call i64 @llvm.ctpop.i64(i64 %ld2)
+ %ctpop3 = call i64 @llvm.ctpop.i64(i64 %ld3)
+ store i64 %ctpop0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+ store i64 %ctpop1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+ store i64 %ctpop2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+ store i64 %ctpop3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+ ret void
+}
+
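+; Note the inversion here: the plain SSE2 run vectorizes to <4 x i32>, while the
+; SSE42/AVX runs (which enable +popcnt, see the RUN lines) keep the scalar calls.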
+define void @ctpop_4i32() #0 {
+; SSE2-LABEL: @ctpop_4i32(
+; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
+; SSE2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctpop_4i32(
+; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; SSE42-NEXT: ret void
+;
+; AVX-LABEL: @ctpop_4i32(
+; AVX-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; AVX-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; AVX-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; AVX-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; AVX-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; AVX-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; AVX-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; AVX-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; AVX-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+ %ctpop0 = call i32 @llvm.ctpop.i32(i32 %ld0)
+ %ctpop1 = call i32 @llvm.ctpop.i32(i32 %ld1)
+ %ctpop2 = call i32 @llvm.ctpop.i32(i32 %ld2)
+ %ctpop3 = call i32 @llvm.ctpop.i32(i32 %ld3)
+ store i32 %ctpop0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %ctpop1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %ctpop2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %ctpop3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @ctpop_8i32() #0 {
+; SSE2-LABEL: @ctpop_8i32(
+; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE2-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]])
+; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]])
+; SSE2-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE2-NEXT: ret void
+;
+; SSE42-LABEL: @ctpop_8i32(
+; SSE42-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; SSE42-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; SSE42-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; SSE42-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; SSE42-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; SSE42-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; SSE42-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; SSE42-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; SSE42-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; SSE42-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; SSE42-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; SSE42-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; SSE42-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]])
+; SSE42-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]])
+; SSE42-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]])
+; SSE42-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]])
+; SSE42-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; SSE42-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; SSE42-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; SSE42-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; SSE42-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; SSE42-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; SSE42-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; SSE42-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE42-NEXT: ret void
+;
+; AVX1-LABEL: @ctpop_8i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; AVX1-NEXT: [[CTPOP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD0]])
+; AVX1-NEXT: [[CTPOP1:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD1]])
+; AVX1-NEXT: [[CTPOP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD2]])
+; AVX1-NEXT: [[CTPOP3:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD3]])
+; AVX1-NEXT: [[CTPOP4:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD4]])
+; AVX1-NEXT: [[CTPOP5:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD5]])
+; AVX1-NEXT: [[CTPOP6:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD6]])
+; AVX1-NEXT: [[CTPOP7:%.*]] = call i32 @llvm.ctpop.i32(i32 [[LD7]])
+; AVX1-NEXT: store i32 [[CTPOP0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; AVX1-NEXT: store i32 [[CTPOP1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; AVX1-NEXT: store i32 [[CTPOP2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; AVX1-NEXT: store i32 [[CTPOP3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; AVX1-NEXT: store i32 [[CTPOP4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; AVX1-NEXT: store i32 [[CTPOP5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; AVX1-NEXT: store i32 [[CTPOP6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; AVX1-NEXT: store i32 [[CTPOP7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ctpop_8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> [[TMP1]])
+; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX2-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+ %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+ %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+ %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+ %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+ %ctpop0 = call i32 @llvm.ctpop.i32(i32 %ld0)
+ %ctpop1 = call i32 @llvm.ctpop.i32(i32 %ld1)
+ %ctpop2 = call i32 @llvm.ctpop.i32(i32 %ld2)
+ %ctpop3 = call i32 @llvm.ctpop.i32(i32 %ld3)
+ %ctpop4 = call i32 @llvm.ctpop.i32(i32 %ld4)
+ %ctpop5 = call i32 @llvm.ctpop.i32(i32 %ld5)
+ %ctpop6 = call i32 @llvm.ctpop.i32(i32 %ld6)
+ %ctpop7 = call i32 @llvm.ctpop.i32(i32 %ld7)
+ store i32 %ctpop0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+ store i32 %ctpop1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+ store i32 %ctpop2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+ store i32 %ctpop3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+ store i32 %ctpop4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+ store i32 %ctpop5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+ store i32 %ctpop6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+ store i32 %ctpop7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+ ret void
+}
+
+define void @ctpop_8i16() #0 {
+; CHECK-LABEL: @ctpop_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]])
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ctpop0 = call i16 @llvm.ctpop.i16(i16 %ld0)
+ %ctpop1 = call i16 @llvm.ctpop.i16(i16 %ld1)
+ %ctpop2 = call i16 @llvm.ctpop.i16(i16 %ld2)
+ %ctpop3 = call i16 @llvm.ctpop.i16(i16 %ld3)
+ %ctpop4 = call i16 @llvm.ctpop.i16(i16 %ld4)
+ %ctpop5 = call i16 @llvm.ctpop.i16(i16 %ld5)
+ %ctpop6 = call i16 @llvm.ctpop.i16(i16 %ld6)
+ %ctpop7 = call i16 @llvm.ctpop.i16(i16 %ld7)
+ store i16 %ctpop0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %ctpop1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %ctpop2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %ctpop3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %ctpop4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %ctpop5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %ctpop6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %ctpop7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ ret void
+}
+
+define void @ctpop_16i16() #0 {
+; SSE-LABEL: @ctpop_16i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP2]])
+; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @ctpop_16i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> [[TMP1]])
+; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
+ %ld9 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
+ %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
+ %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
+ %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
+ %ctpop0 = call i16 @llvm.ctpop.i16(i16 %ld0)
+ %ctpop1 = call i16 @llvm.ctpop.i16(i16 %ld1)
+ %ctpop2 = call i16 @llvm.ctpop.i16(i16 %ld2)
+ %ctpop3 = call i16 @llvm.ctpop.i16(i16 %ld3)
+ %ctpop4 = call i16 @llvm.ctpop.i16(i16 %ld4)
+ %ctpop5 = call i16 @llvm.ctpop.i16(i16 %ld5)
+ %ctpop6 = call i16 @llvm.ctpop.i16(i16 %ld6)
+ %ctpop7 = call i16 @llvm.ctpop.i16(i16 %ld7)
+ %ctpop8 = call i16 @llvm.ctpop.i16(i16 %ld8)
+ %ctpop9 = call i16 @llvm.ctpop.i16(i16 %ld9)
+ %ctpop10 = call i16 @llvm.ctpop.i16(i16 %ld10)
+ %ctpop11 = call i16 @llvm.ctpop.i16(i16 %ld11)
+ %ctpop12 = call i16 @llvm.ctpop.i16(i16 %ld12)
+ %ctpop13 = call i16 @llvm.ctpop.i16(i16 %ld13)
+ %ctpop14 = call i16 @llvm.ctpop.i16(i16 %ld14)
+ %ctpop15 = call i16 @llvm.ctpop.i16(i16 %ld15)
+ store i16 %ctpop0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %ctpop1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %ctpop2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %ctpop3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %ctpop4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %ctpop5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %ctpop6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %ctpop7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ store i16 %ctpop8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
+ store i16 %ctpop9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
+ store i16 %ctpop10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
+ store i16 %ctpop11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
+ store i16 %ctpop12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
+ store i16 %ctpop13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
+ store i16 %ctpop14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
+ store i16 %ctpop15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
+ ret void
+}
+
+define void @ctpop_16i8() #0 {
+; CHECK-LABEL: @ctpop_16i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]])
+; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ctpop0 = call i8 @llvm.ctpop.i8(i8 %ld0)
+ %ctpop1 = call i8 @llvm.ctpop.i8(i8 %ld1)
+ %ctpop2 = call i8 @llvm.ctpop.i8(i8 %ld2)
+ %ctpop3 = call i8 @llvm.ctpop.i8(i8 %ld3)
+ %ctpop4 = call i8 @llvm.ctpop.i8(i8 %ld4)
+ %ctpop5 = call i8 @llvm.ctpop.i8(i8 %ld5)
+ %ctpop6 = call i8 @llvm.ctpop.i8(i8 %ld6)
+ %ctpop7 = call i8 @llvm.ctpop.i8(i8 %ld7)
+ %ctpop8 = call i8 @llvm.ctpop.i8(i8 %ld8)
+ %ctpop9 = call i8 @llvm.ctpop.i8(i8 %ld9)
+ %ctpop10 = call i8 @llvm.ctpop.i8(i8 %ld10)
+ %ctpop11 = call i8 @llvm.ctpop.i8(i8 %ld11)
+ %ctpop12 = call i8 @llvm.ctpop.i8(i8 %ld12)
+ %ctpop13 = call i8 @llvm.ctpop.i8(i8 %ld13)
+ %ctpop14 = call i8 @llvm.ctpop.i8(i8 %ld14)
+ %ctpop15 = call i8 @llvm.ctpop.i8(i8 %ld15)
+ store i8 %ctpop0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %ctpop1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %ctpop2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %ctpop3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %ctpop4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %ctpop5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %ctpop6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %ctpop7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %ctpop8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %ctpop9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %ctpop10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %ctpop11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %ctpop12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %ctpop13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %ctpop14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %ctpop15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ ret void
+}
+
+define void @ctpop_32i8() #0 {
+; CHECK-LABEL: @ctpop_32i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]])
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP2]])
+; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ld16 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
+ %ld17 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
+ %ld18 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
+ %ld19 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
+ %ld20 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
+ %ld21 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
+ %ld22 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
+ %ld23 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
+ %ld24 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
+ %ld25 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
+ %ld26 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
+ %ld27 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
+ %ld28 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
+ %ld29 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
+ %ld30 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
+ %ld31 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
+ %ctpop0 = call i8 @llvm.ctpop.i8(i8 %ld0)
+ %ctpop1 = call i8 @llvm.ctpop.i8(i8 %ld1)
+ %ctpop2 = call i8 @llvm.ctpop.i8(i8 %ld2)
+ %ctpop3 = call i8 @llvm.ctpop.i8(i8 %ld3)
+ %ctpop4 = call i8 @llvm.ctpop.i8(i8 %ld4)
+ %ctpop5 = call i8 @llvm.ctpop.i8(i8 %ld5)
+ %ctpop6 = call i8 @llvm.ctpop.i8(i8 %ld6)
+ %ctpop7 = call i8 @llvm.ctpop.i8(i8 %ld7)
+ %ctpop8 = call i8 @llvm.ctpop.i8(i8 %ld8)
+ %ctpop9 = call i8 @llvm.ctpop.i8(i8 %ld9)
+ %ctpop10 = call i8 @llvm.ctpop.i8(i8 %ld10)
+ %ctpop11 = call i8 @llvm.ctpop.i8(i8 %ld11)
+ %ctpop12 = call i8 @llvm.ctpop.i8(i8 %ld12)
+ %ctpop13 = call i8 @llvm.ctpop.i8(i8 %ld13)
+ %ctpop14 = call i8 @llvm.ctpop.i8(i8 %ld14)
+ %ctpop15 = call i8 @llvm.ctpop.i8(i8 %ld15)
+ %ctpop16 = call i8 @llvm.ctpop.i8(i8 %ld16)
+ %ctpop17 = call i8 @llvm.ctpop.i8(i8 %ld17)
+ %ctpop18 = call i8 @llvm.ctpop.i8(i8 %ld18)
+ %ctpop19 = call i8 @llvm.ctpop.i8(i8 %ld19)
+ %ctpop20 = call i8 @llvm.ctpop.i8(i8 %ld20)
+ %ctpop21 = call i8 @llvm.ctpop.i8(i8 %ld21)
+ %ctpop22 = call i8 @llvm.ctpop.i8(i8 %ld22)
+ %ctpop23 = call i8 @llvm.ctpop.i8(i8 %ld23)
+ %ctpop24 = call i8 @llvm.ctpop.i8(i8 %ld24)
+ %ctpop25 = call i8 @llvm.ctpop.i8(i8 %ld25)
+ %ctpop26 = call i8 @llvm.ctpop.i8(i8 %ld26)
+ %ctpop27 = call i8 @llvm.ctpop.i8(i8 %ld27)
+ %ctpop28 = call i8 @llvm.ctpop.i8(i8 %ld28)
+ %ctpop29 = call i8 @llvm.ctpop.i8(i8 %ld29)
+ %ctpop30 = call i8 @llvm.ctpop.i8(i8 %ld30)
+ %ctpop31 = call i8 @llvm.ctpop.i8(i8 %ld31)
+ store i8 %ctpop0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %ctpop1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %ctpop2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %ctpop3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %ctpop4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %ctpop5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %ctpop6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %ctpop7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %ctpop8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %ctpop9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %ctpop10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %ctpop11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %ctpop12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %ctpop13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %ctpop14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %ctpop15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ store i8 %ctpop16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
+ store i8 %ctpop17, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
+ store i8 %ctpop18, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
+ store i8 %ctpop19, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
+ store i8 %ctpop20, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
+ store i8 %ctpop21, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
+ store i8 %ctpop22, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
+ store i8 %ctpop23, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
+ store i8 %ctpop24, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
+ store i8 %ctpop25, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
+ store i8 %ctpop26, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
+ store i8 %ctpop27, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
+ store i8 %ctpop28, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
+ store i8 %ctpop29, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
+ store i8 %ctpop30, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
+ store i8 %ctpop31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
+ ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
new file mode 100644
index 00000000000..751fea779ce
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll
@@ -0,0 +1,905 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
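+; The four RUN lines exercise the SSE2, SSE42, AVX1 and AVX2 cost models;
+; behavior shared across configurations is checked under the common
+; CHECK/SSE/AVX prefixes.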
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [4 x i64] zeroinitializer, align 32
+@dst64 = common global [4 x i64] zeroinitializer, align 32
+@src32 = common global [8 x i32] zeroinitializer, align 32
+@dst32 = common global [8 x i32] zeroinitializer, align 32
+@src16 = common global [16 x i16] zeroinitializer, align 32
+@dst16 = common global [16 x i16] zeroinitializer, align 32
+@src8 = common global [32 x i8] zeroinitializer, align 32
+@dst8 = common global [32 x i8] zeroinitializer, align 32
+
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i8 @llvm.cttz.i8(i8, i1)
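+
+; Note: the second operand of llvm.cttz is the is_zero_undef flag. The plain
+; CTTZ tests pass false (written `i1 0` in the bodies, which the IR parser
+; canonicalizes to `i1 false` in the CHECK lines), so a zero input must return
+; the bit width. The CTTZ_ZERO_UNDEF tests pass true (`i1 -1`), leaving the
+; result undefined for a zero input.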
+
+;
+; CTTZ
+;
+
+define void @cttz_2i64() #0 {
+; CHECK-LABEL: @cttz_2i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false)
+; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false)
+; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+ %cttz0 = call i64 @llvm.cttz.i64(i64 %ld0, i1 0)
+ %cttz1 = call i64 @llvm.cttz.i64(i64 %ld1, i1 0)
+ store i64 %cttz0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %cttz1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @cttz_4i64() #0 {
+; CHECK-LABEL: @cttz_4i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 false)
+; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 false)
+; CHECK-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 false)
+; CHECK-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 false)
+; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; CHECK-NEXT: store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; CHECK-NEXT: store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+ %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+ %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+ %cttz0 = call i64 @llvm.cttz.i64(i64 %ld0, i1 0)
+ %cttz1 = call i64 @llvm.cttz.i64(i64 %ld1, i1 0)
+ %cttz2 = call i64 @llvm.cttz.i64(i64 %ld2, i1 0)
+ %cttz3 = call i64 @llvm.cttz.i64(i64 %ld3, i1 0)
+ store i64 %cttz0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+ store i64 %cttz1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+ store i64 %cttz2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+ store i64 %cttz3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+ ret void
+}
+
+define void @cttz_4i32() #0 {
+; CHECK-LABEL: @cttz_4i32(
+; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
+; CHECK-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
+; CHECK-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
+; CHECK-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
+; CHECK-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+ %cttz0 = call i32 @llvm.cttz.i32(i32 %ld0, i1 0)
+ %cttz1 = call i32 @llvm.cttz.i32(i32 %ld1, i1 0)
+ %cttz2 = call i32 @llvm.cttz.i32(i32 %ld2, i1 0)
+ %cttz3 = call i32 @llvm.cttz.i32(i32 %ld3, i1 0)
+ store i32 %cttz0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %cttz1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %cttz2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %cttz3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @cttz_8i32() #0 {
+; SSE-LABEL: @cttz_8i32(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; SSE-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
+; SSE-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
+; SSE-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
+; SSE-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
+; SSE-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false)
+; SSE-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false)
+; SSE-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false)
+; SSE-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false)
+; SSE-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; SSE-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; SSE-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; SSE-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; SSE-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; SSE-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; SSE-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; SSE-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @cttz_8i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false)
+; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false)
+; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false)
+; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false)
+; AVX1-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false)
+; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false)
+; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false)
+; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false)
+; AVX1-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; AVX1-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; AVX1-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; AVX1-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; AVX1-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; AVX1-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; AVX1-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; AVX1-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @cttz_8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false)
+; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX2-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+ %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+ %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+ %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+ %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+ %cttz0 = call i32 @llvm.cttz.i32(i32 %ld0, i1 0)
+ %cttz1 = call i32 @llvm.cttz.i32(i32 %ld1, i1 0)
+ %cttz2 = call i32 @llvm.cttz.i32(i32 %ld2, i1 0)
+ %cttz3 = call i32 @llvm.cttz.i32(i32 %ld3, i1 0)
+ %cttz4 = call i32 @llvm.cttz.i32(i32 %ld4, i1 0)
+ %cttz5 = call i32 @llvm.cttz.i32(i32 %ld5, i1 0)
+ %cttz6 = call i32 @llvm.cttz.i32(i32 %ld6, i1 0)
+ %cttz7 = call i32 @llvm.cttz.i32(i32 %ld7, i1 0)
+ store i32 %cttz0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+ store i32 %cttz1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+ store i32 %cttz2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+ store i32 %cttz3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+ store i32 %cttz4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+ store i32 %cttz5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+ store i32 %cttz6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+ store i32 %cttz7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+ ret void
+}
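+
+; The checks above capture the v8i32 cost-model split: SSE and AVX1 keep the
+; eight scalar cttz calls, while AVX2 is the first configuration where a
+; single <8 x i32> @llvm.cttz.v8i32 call is considered profitable.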
+
+define void @cttz_8i16() #0 {
+; CHECK-LABEL: @cttz_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false)
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %cttz0 = call i16 @llvm.cttz.i16(i16 %ld0, i1 0)
+ %cttz1 = call i16 @llvm.cttz.i16(i16 %ld1, i1 0)
+ %cttz2 = call i16 @llvm.cttz.i16(i16 %ld2, i1 0)
+ %cttz3 = call i16 @llvm.cttz.i16(i16 %ld3, i1 0)
+ %cttz4 = call i16 @llvm.cttz.i16(i16 %ld4, i1 0)
+ %cttz5 = call i16 @llvm.cttz.i16(i16 %ld5, i1 0)
+ %cttz6 = call i16 @llvm.cttz.i16(i16 %ld6, i1 0)
+ %cttz7 = call i16 @llvm.cttz.i16(i16 %ld7, i1 0)
+ store i16 %cttz0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %cttz1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %cttz2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %cttz3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %cttz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %cttz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %cttz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %cttz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ ret void
+}
+
+define void @cttz_16i16() #0 {
+; SSE-LABEL: @cttz_16i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false)
+; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 false)
+; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @cttz_16i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> [[TMP1]], i1 false)
+; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
+ %ld9 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
+ %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
+ %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
+ %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
+ %cttz0 = call i16 @llvm.cttz.i16(i16 %ld0, i1 0)
+ %cttz1 = call i16 @llvm.cttz.i16(i16 %ld1, i1 0)
+ %cttz2 = call i16 @llvm.cttz.i16(i16 %ld2, i1 0)
+ %cttz3 = call i16 @llvm.cttz.i16(i16 %ld3, i1 0)
+ %cttz4 = call i16 @llvm.cttz.i16(i16 %ld4, i1 0)
+ %cttz5 = call i16 @llvm.cttz.i16(i16 %ld5, i1 0)
+ %cttz6 = call i16 @llvm.cttz.i16(i16 %ld6, i1 0)
+ %cttz7 = call i16 @llvm.cttz.i16(i16 %ld7, i1 0)
+ %cttz8 = call i16 @llvm.cttz.i16(i16 %ld8, i1 0)
+ %cttz9 = call i16 @llvm.cttz.i16(i16 %ld9, i1 0)
+ %cttz10 = call i16 @llvm.cttz.i16(i16 %ld10, i1 0)
+ %cttz11 = call i16 @llvm.cttz.i16(i16 %ld11, i1 0)
+ %cttz12 = call i16 @llvm.cttz.i16(i16 %ld12, i1 0)
+ %cttz13 = call i16 @llvm.cttz.i16(i16 %ld13, i1 0)
+ %cttz14 = call i16 @llvm.cttz.i16(i16 %ld14, i1 0)
+ %cttz15 = call i16 @llvm.cttz.i16(i16 %ld15, i1 0)
+ store i16 %cttz0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %cttz1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %cttz2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %cttz3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %cttz4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %cttz5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %cttz6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %cttz7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ store i16 %cttz8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
+ store i16 %cttz9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
+ store i16 %cttz10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
+ store i16 %cttz11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
+ store i16 %cttz12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
+ store i16 %cttz13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
+ store i16 %cttz14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
+ store i16 %cttz15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
+ ret void
+}
+
+define void @cttz_16i8() #0 {
+; CHECK-LABEL: @cttz_16i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false)
+; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %cttz0 = call i8 @llvm.cttz.i8(i8 %ld0, i1 0)
+ %cttz1 = call i8 @llvm.cttz.i8(i8 %ld1, i1 0)
+ %cttz2 = call i8 @llvm.cttz.i8(i8 %ld2, i1 0)
+ %cttz3 = call i8 @llvm.cttz.i8(i8 %ld3, i1 0)
+ %cttz4 = call i8 @llvm.cttz.i8(i8 %ld4, i1 0)
+ %cttz5 = call i8 @llvm.cttz.i8(i8 %ld5, i1 0)
+ %cttz6 = call i8 @llvm.cttz.i8(i8 %ld6, i1 0)
+ %cttz7 = call i8 @llvm.cttz.i8(i8 %ld7, i1 0)
+ %cttz8 = call i8 @llvm.cttz.i8(i8 %ld8, i1 0)
+ %cttz9 = call i8 @llvm.cttz.i8(i8 %ld9, i1 0)
+ %cttz10 = call i8 @llvm.cttz.i8(i8 %ld10, i1 0)
+ %cttz11 = call i8 @llvm.cttz.i8(i8 %ld11, i1 0)
+ %cttz12 = call i8 @llvm.cttz.i8(i8 %ld12, i1 0)
+ %cttz13 = call i8 @llvm.cttz.i8(i8 %ld13, i1 0)
+ %cttz14 = call i8 @llvm.cttz.i8(i8 %ld14, i1 0)
+ %cttz15 = call i8 @llvm.cttz.i8(i8 %ld15, i1 0)
+ store i8 %cttz0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %cttz1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %cttz2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %cttz3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %cttz4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %cttz5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %cttz6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %cttz7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %cttz8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %cttz9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %cttz10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %cttz11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %cttz12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %cttz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %cttz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %cttz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ ret void
+}
+
+define void @cttz_32i8() #0 {
+; CHECK-LABEL: @cttz_32i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false)
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 false)
+; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ld16 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
+ %ld17 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
+ %ld18 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
+ %ld19 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
+ %ld20 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
+ %ld21 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
+ %ld22 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
+ %ld23 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
+ %ld24 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
+ %ld25 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
+ %ld26 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
+ %ld27 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
+ %ld28 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
+ %ld29 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
+ %ld30 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
+ %ld31 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
+ %cttz0 = call i8 @llvm.cttz.i8(i8 %ld0, i1 0)
+ %cttz1 = call i8 @llvm.cttz.i8(i8 %ld1, i1 0)
+ %cttz2 = call i8 @llvm.cttz.i8(i8 %ld2, i1 0)
+ %cttz3 = call i8 @llvm.cttz.i8(i8 %ld3, i1 0)
+ %cttz4 = call i8 @llvm.cttz.i8(i8 %ld4, i1 0)
+ %cttz5 = call i8 @llvm.cttz.i8(i8 %ld5, i1 0)
+ %cttz6 = call i8 @llvm.cttz.i8(i8 %ld6, i1 0)
+ %cttz7 = call i8 @llvm.cttz.i8(i8 %ld7, i1 0)
+ %cttz8 = call i8 @llvm.cttz.i8(i8 %ld8, i1 0)
+ %cttz9 = call i8 @llvm.cttz.i8(i8 %ld9, i1 0)
+ %cttz10 = call i8 @llvm.cttz.i8(i8 %ld10, i1 0)
+ %cttz11 = call i8 @llvm.cttz.i8(i8 %ld11, i1 0)
+ %cttz12 = call i8 @llvm.cttz.i8(i8 %ld12, i1 0)
+ %cttz13 = call i8 @llvm.cttz.i8(i8 %ld13, i1 0)
+ %cttz14 = call i8 @llvm.cttz.i8(i8 %ld14, i1 0)
+ %cttz15 = call i8 @llvm.cttz.i8(i8 %ld15, i1 0)
+ %cttz16 = call i8 @llvm.cttz.i8(i8 %ld16, i1 0)
+ %cttz17 = call i8 @llvm.cttz.i8(i8 %ld17, i1 0)
+ %cttz18 = call i8 @llvm.cttz.i8(i8 %ld18, i1 0)
+ %cttz19 = call i8 @llvm.cttz.i8(i8 %ld19, i1 0)
+ %cttz20 = call i8 @llvm.cttz.i8(i8 %ld20, i1 0)
+ %cttz21 = call i8 @llvm.cttz.i8(i8 %ld21, i1 0)
+ %cttz22 = call i8 @llvm.cttz.i8(i8 %ld22, i1 0)
+ %cttz23 = call i8 @llvm.cttz.i8(i8 %ld23, i1 0)
+ %cttz24 = call i8 @llvm.cttz.i8(i8 %ld24, i1 0)
+ %cttz25 = call i8 @llvm.cttz.i8(i8 %ld25, i1 0)
+ %cttz26 = call i8 @llvm.cttz.i8(i8 %ld26, i1 0)
+ %cttz27 = call i8 @llvm.cttz.i8(i8 %ld27, i1 0)
+ %cttz28 = call i8 @llvm.cttz.i8(i8 %ld28, i1 0)
+ %cttz29 = call i8 @llvm.cttz.i8(i8 %ld29, i1 0)
+ %cttz30 = call i8 @llvm.cttz.i8(i8 %ld30, i1 0)
+ %cttz31 = call i8 @llvm.cttz.i8(i8 %ld31, i1 0)
+ store i8 %cttz0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %cttz1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %cttz2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %cttz3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %cttz4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %cttz5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %cttz6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %cttz7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %cttz8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %cttz9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %cttz10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %cttz11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %cttz12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %cttz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %cttz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %cttz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ store i8 %cttz16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
+ store i8 %cttz17, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
+ store i8 %cttz18, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
+ store i8 %cttz19, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
+ store i8 %cttz20, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
+ store i8 %cttz21, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
+ store i8 %cttz22, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
+ store i8 %cttz23, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
+ store i8 %cttz24, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
+ store i8 %cttz25, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
+ store i8 %cttz26, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
+ store i8 %cttz27, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
+ store i8 %cttz28, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
+ store i8 %cttz29, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
+ store i8 %cttz30, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
+ store i8 %cttz31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
+ ret void
+}
+
+;
+; CTTZ_ZERO_UNDEF
+;
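+; Same pattern as the CTTZ tests above, but with the is_zero_undef flag set
+; (`i1 -1`/`i1 true`); within these tests the vectorization decisions mirror
+; the `i1 false` variants.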
+
+define void @cttz_undef_2i64() #0 {
+; CHECK-LABEL: @cttz_undef_2i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true)
+; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true)
+; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
+ %cttz0 = call i64 @llvm.cttz.i64(i64 %ld0, i1 -1)
+ %cttz1 = call i64 @llvm.cttz.i64(i64 %ld1, i1 -1)
+ store i64 %cttz0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %cttz1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @cttz_undef_4i64() #0 {
+; CHECK-LABEL: @cttz_undef_4i64(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+; CHECK-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+; CHECK-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+; CHECK-NEXT: [[CTTZ0:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD0]], i1 true)
+; CHECK-NEXT: [[CTTZ1:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD1]], i1 true)
+; CHECK-NEXT: [[CTTZ2:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD2]], i1 true)
+; CHECK-NEXT: [[CTTZ3:%.*]] = call i64 @llvm.cttz.i64(i64 [[LD3]], i1 true)
+; CHECK-NEXT: store i64 [[CTTZ0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+; CHECK-NEXT: store i64 [[CTTZ1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+; CHECK-NEXT: store i64 [[CTTZ2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+; CHECK-NEXT: store i64 [[CTTZ3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
+ %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
+ %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
+ %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
+ %cttz0 = call i64 @llvm.cttz.i64(i64 %ld0, i1 -1)
+ %cttz1 = call i64 @llvm.cttz.i64(i64 %ld1, i1 -1)
+ %cttz2 = call i64 @llvm.cttz.i64(i64 %ld2, i1 -1)
+ %cttz3 = call i64 @llvm.cttz.i64(i64 %ld3, i1 -1)
+ store i64 %cttz0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
+ store i64 %cttz1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
+ store i64 %cttz2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
+ store i64 %cttz3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+ ret void
+}
+
+define void @cttz_undef_4i32() #0 {
+; CHECK-LABEL: @cttz_undef_4i32(
+; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
+; CHECK-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
+; CHECK-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
+; CHECK-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
+; CHECK-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+; CHECK-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+; CHECK-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+; CHECK-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
+ %cttz0 = call i32 @llvm.cttz.i32(i32 %ld0, i1 -1)
+ %cttz1 = call i32 @llvm.cttz.i32(i32 %ld1, i1 -1)
+ %cttz2 = call i32 @llvm.cttz.i32(i32 %ld2, i1 -1)
+ %cttz3 = call i32 @llvm.cttz.i32(i32 %ld3, i1 -1)
+ store i32 %cttz0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %cttz1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %cttz2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %cttz3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @cttz_undef_8i32() #0 {
+; SSE-LABEL: @cttz_undef_8i32(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; SSE-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
+; SSE-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
+; SSE-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
+; SSE-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
+; SSE-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 true)
+; SSE-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true)
+; SSE-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true)
+; SSE-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true)
+; SSE-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; SSE-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; SSE-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; SSE-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; SSE-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; SSE-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; SSE-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; SSE-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @cttz_undef_8i32(
+; AVX1-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+; AVX1-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+; AVX1-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+; AVX1-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+; AVX1-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+; AVX1-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+; AVX1-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+; AVX1-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 true)
+; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 true)
+; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 true)
+; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 true)
+; AVX1-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 true)
+; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 true)
+; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 true)
+; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 true)
+; AVX1-NEXT: store i32 [[CTTZ0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+; AVX1-NEXT: store i32 [[CTTZ1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+; AVX1-NEXT: store i32 [[CTTZ2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+; AVX1-NEXT: store i32 [[CTTZ3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+; AVX1-NEXT: store i32 [[CTTZ4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+; AVX1-NEXT: store i32 [[CTTZ5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+; AVX1-NEXT: store i32 [[CTTZ6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+; AVX1-NEXT: store i32 [[CTTZ7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @cttz_undef_8i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
+; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 true)
+; AVX2-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
+; AVX2-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
+ %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
+ %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
+ %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
+ %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
+ %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
+ %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
+ %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
+ %cttz0 = call i32 @llvm.cttz.i32(i32 %ld0, i1 -1)
+ %cttz1 = call i32 @llvm.cttz.i32(i32 %ld1, i1 -1)
+ %cttz2 = call i32 @llvm.cttz.i32(i32 %ld2, i1 -1)
+ %cttz3 = call i32 @llvm.cttz.i32(i32 %ld3, i1 -1)
+ %cttz4 = call i32 @llvm.cttz.i32(i32 %ld4, i1 -1)
+ %cttz5 = call i32 @llvm.cttz.i32(i32 %ld5, i1 -1)
+ %cttz6 = call i32 @llvm.cttz.i32(i32 %ld6, i1 -1)
+ %cttz7 = call i32 @llvm.cttz.i32(i32 %ld7, i1 -1)
+ store i32 %cttz0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
+ store i32 %cttz1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
+ store i32 %cttz2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
+ store i32 %cttz3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
+ store i32 %cttz4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
+ store i32 %cttz5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
+ store i32 %cttz6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
+ store i32 %cttz7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+ ret void
+}
+
+define void @cttz_undef_8i16() #0 {
+; CHECK-LABEL: @cttz_undef_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true)
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %cttz0 = call i16 @llvm.cttz.i16(i16 %ld0, i1 -1)
+ %cttz1 = call i16 @llvm.cttz.i16(i16 %ld1, i1 -1)
+ %cttz2 = call i16 @llvm.cttz.i16(i16 %ld2, i1 -1)
+ %cttz3 = call i16 @llvm.cttz.i16(i16 %ld3, i1 -1)
+ %cttz4 = call i16 @llvm.cttz.i16(i16 %ld4, i1 -1)
+ %cttz5 = call i16 @llvm.cttz.i16(i16 %ld5, i1 -1)
+ %cttz6 = call i16 @llvm.cttz.i16(i16 %ld6, i1 -1)
+ %cttz7 = call i16 @llvm.cttz.i16(i16 %ld7, i1 -1)
+ store i16 %cttz0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %cttz1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %cttz2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %cttz3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %cttz4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %cttz5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %cttz6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %cttz7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ ret void
+}
+
+define void @cttz_undef_16i16() #0 {
+; SSE-LABEL: @cttz_undef_16i16(
+; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true)
+; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 true)
+; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @cttz_undef_16i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> [[TMP1]], i1 true)
+; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
+ %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
+ %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
+ %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
+ %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
+ %ld9 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
+ %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
+ %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
+ %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
+ %cttz0 = call i16 @llvm.cttz.i16(i16 %ld0, i1 -1)
+ %cttz1 = call i16 @llvm.cttz.i16(i16 %ld1, i1 -1)
+ %cttz2 = call i16 @llvm.cttz.i16(i16 %ld2, i1 -1)
+ %cttz3 = call i16 @llvm.cttz.i16(i16 %ld3, i1 -1)
+ %cttz4 = call i16 @llvm.cttz.i16(i16 %ld4, i1 -1)
+ %cttz5 = call i16 @llvm.cttz.i16(i16 %ld5, i1 -1)
+ %cttz6 = call i16 @llvm.cttz.i16(i16 %ld6, i1 -1)
+ %cttz7 = call i16 @llvm.cttz.i16(i16 %ld7, i1 -1)
+ %cttz8 = call i16 @llvm.cttz.i16(i16 %ld8, i1 -1)
+ %cttz9 = call i16 @llvm.cttz.i16(i16 %ld9, i1 -1)
+ %cttz10 = call i16 @llvm.cttz.i16(i16 %ld10, i1 -1)
+ %cttz11 = call i16 @llvm.cttz.i16(i16 %ld11, i1 -1)
+ %cttz12 = call i16 @llvm.cttz.i16(i16 %ld12, i1 -1)
+ %cttz13 = call i16 @llvm.cttz.i16(i16 %ld13, i1 -1)
+ %cttz14 = call i16 @llvm.cttz.i16(i16 %ld14, i1 -1)
+ %cttz15 = call i16 @llvm.cttz.i16(i16 %ld15, i1 -1)
+ store i16 %cttz0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
+ store i16 %cttz1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
+ store i16 %cttz2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
+ store i16 %cttz3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
+ store i16 %cttz4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
+ store i16 %cttz5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
+ store i16 %cttz6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
+ store i16 %cttz7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
+ store i16 %cttz8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
+ store i16 %cttz9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
+ store i16 %cttz10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
+ store i16 %cttz11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
+ store i16 %cttz12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
+ store i16 %cttz13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
+ store i16 %cttz14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
+ store i16 %cttz15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
+ ret void
+}
+
+define void @cttz_undef_16i8() #0 {
+; CHECK-LABEL: @cttz_undef_16i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true)
+; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %cttz0 = call i8 @llvm.cttz.i8(i8 %ld0, i1 -1)
+ %cttz1 = call i8 @llvm.cttz.i8(i8 %ld1, i1 -1)
+ %cttz2 = call i8 @llvm.cttz.i8(i8 %ld2, i1 -1)
+ %cttz3 = call i8 @llvm.cttz.i8(i8 %ld3, i1 -1)
+ %cttz4 = call i8 @llvm.cttz.i8(i8 %ld4, i1 -1)
+ %cttz5 = call i8 @llvm.cttz.i8(i8 %ld5, i1 -1)
+ %cttz6 = call i8 @llvm.cttz.i8(i8 %ld6, i1 -1)
+ %cttz7 = call i8 @llvm.cttz.i8(i8 %ld7, i1 -1)
+ %cttz8 = call i8 @llvm.cttz.i8(i8 %ld8, i1 -1)
+ %cttz9 = call i8 @llvm.cttz.i8(i8 %ld9, i1 -1)
+ %cttz10 = call i8 @llvm.cttz.i8(i8 %ld10, i1 -1)
+ %cttz11 = call i8 @llvm.cttz.i8(i8 %ld11, i1 -1)
+ %cttz12 = call i8 @llvm.cttz.i8(i8 %ld12, i1 -1)
+ %cttz13 = call i8 @llvm.cttz.i8(i8 %ld13, i1 -1)
+ %cttz14 = call i8 @llvm.cttz.i8(i8 %ld14, i1 -1)
+ %cttz15 = call i8 @llvm.cttz.i8(i8 %ld15, i1 -1)
+ store i8 %cttz0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %cttz1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %cttz2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %cttz3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %cttz4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %cttz5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %cttz6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %cttz7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %cttz8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %cttz9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %cttz10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %cttz11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %cttz12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %cttz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %cttz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %cttz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ ret void
+}
+
+define void @cttz_undef_32i8() #0 {
+; CHECK-LABEL: @cttz_undef_32i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 true)
+; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
+ %ld1 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
+ %ld3 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
+ %ld5 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
+ %ld7 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
+ %ld9 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
+ %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
+ %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
+ %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
+ %ld16 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
+ %ld17 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
+ %ld18 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
+ %ld19 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
+ %ld20 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
+ %ld21 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
+ %ld22 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
+ %ld23 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
+ %ld24 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
+ %ld25 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
+ %ld26 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
+ %ld27 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
+ %ld28 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
+ %ld29 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
+ %ld30 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
+ %ld31 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
+ %cttz0 = call i8 @llvm.cttz.i8(i8 %ld0, i1 -1)
+ %cttz1 = call i8 @llvm.cttz.i8(i8 %ld1, i1 -1)
+ %cttz2 = call i8 @llvm.cttz.i8(i8 %ld2, i1 -1)
+ %cttz3 = call i8 @llvm.cttz.i8(i8 %ld3, i1 -1)
+ %cttz4 = call i8 @llvm.cttz.i8(i8 %ld4, i1 -1)
+ %cttz5 = call i8 @llvm.cttz.i8(i8 %ld5, i1 -1)
+ %cttz6 = call i8 @llvm.cttz.i8(i8 %ld6, i1 -1)
+ %cttz7 = call i8 @llvm.cttz.i8(i8 %ld7, i1 -1)
+ %cttz8 = call i8 @llvm.cttz.i8(i8 %ld8, i1 -1)
+ %cttz9 = call i8 @llvm.cttz.i8(i8 %ld9, i1 -1)
+ %cttz10 = call i8 @llvm.cttz.i8(i8 %ld10, i1 -1)
+ %cttz11 = call i8 @llvm.cttz.i8(i8 %ld11, i1 -1)
+ %cttz12 = call i8 @llvm.cttz.i8(i8 %ld12, i1 -1)
+ %cttz13 = call i8 @llvm.cttz.i8(i8 %ld13, i1 -1)
+ %cttz14 = call i8 @llvm.cttz.i8(i8 %ld14, i1 -1)
+ %cttz15 = call i8 @llvm.cttz.i8(i8 %ld15, i1 -1)
+ %cttz16 = call i8 @llvm.cttz.i8(i8 %ld16, i1 -1)
+ %cttz17 = call i8 @llvm.cttz.i8(i8 %ld17, i1 -1)
+ %cttz18 = call i8 @llvm.cttz.i8(i8 %ld18, i1 -1)
+ %cttz19 = call i8 @llvm.cttz.i8(i8 %ld19, i1 -1)
+ %cttz20 = call i8 @llvm.cttz.i8(i8 %ld20, i1 -1)
+ %cttz21 = call i8 @llvm.cttz.i8(i8 %ld21, i1 -1)
+ %cttz22 = call i8 @llvm.cttz.i8(i8 %ld22, i1 -1)
+ %cttz23 = call i8 @llvm.cttz.i8(i8 %ld23, i1 -1)
+ %cttz24 = call i8 @llvm.cttz.i8(i8 %ld24, i1 -1)
+ %cttz25 = call i8 @llvm.cttz.i8(i8 %ld25, i1 -1)
+ %cttz26 = call i8 @llvm.cttz.i8(i8 %ld26, i1 -1)
+ %cttz27 = call i8 @llvm.cttz.i8(i8 %ld27, i1 -1)
+ %cttz28 = call i8 @llvm.cttz.i8(i8 %ld28, i1 -1)
+ %cttz29 = call i8 @llvm.cttz.i8(i8 %ld29, i1 -1)
+ %cttz30 = call i8 @llvm.cttz.i8(i8 %ld30, i1 -1)
+ %cttz31 = call i8 @llvm.cttz.i8(i8 %ld31, i1 -1)
+ store i8 %cttz0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
+ store i8 %cttz1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
+ store i8 %cttz2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
+ store i8 %cttz3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
+ store i8 %cttz4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
+ store i8 %cttz5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
+ store i8 %cttz6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
+ store i8 %cttz7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
+ store i8 %cttz8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
+ store i8 %cttz9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
+ store i8 %cttz10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
+ store i8 %cttz11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
+ store i8 %cttz12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
+ store i8 %cttz13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
+ store i8 %cttz14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
+ store i8 %cttz15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
+ store i8 %cttz16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
+ store i8 %cttz17, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
+ store i8 %cttz18, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
+ store i8 %cttz19, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
+ store i8 %cttz20, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
+ store i8 %cttz21, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
+ store i8 %cttz22, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
+ store i8 %cttz23, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
+ store i8 %cttz24, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
+ store i8 %cttz25, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
+ store i8 %cttz26, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
+ store i8 %cttz27, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
+ store i8 %cttz28, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
+ store i8 %cttz29, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
+ store i8 %cttz30, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
+ store i8 %cttz31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll b/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
new file mode 100644
index 00000000000..2ba0a15fed2
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; int foo(int *A) {
+; int r = A[0], g = A[1], b = A[2], a = A[3];
+; for (int i=0; i < A[13]; i++) {
+; r*=18; g*=19; b*=12; a *=9;
+; }
+; A[0] = r; A[1] = g; A[2] = b; A[3] = a;
+; }
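+;
+; The r/g/b/a accumulators form a recurrence through the loop PHIs, so the
+; CHECK lines below expect them to collapse into a single <4 x i32> PHI that
+; is multiplied each iteration by the constant vector <18, 19, 12, 9>.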
+
+define i32 @foo(i32* nocapture %A) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[CMP24:%.*]] = icmp sgt i32 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[CMP24]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ [[TMP4:%.*]], [[FOR_BODY]] ], [ [[TMP1]], [[ENTRY]] ]
+; CHECK-NEXT: [[TMP4]] = mul nsw <4 x i32> [[TMP3]], <i32 18, i32 19, i32 12, i32 9>
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_029]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[TMP2]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i32> [ [[TMP1]], [[ENTRY]] ], [ [[TMP4]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load i32, i32* %A, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32* %A, i64 1
+ %1 = load i32, i32* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 2
+ %2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 3
+ %3 = load i32, i32* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 13
+ %4 = load i32, i32* %arrayidx4, align 4
+ %cmp24 = icmp sgt i32 %4, 0
+ br i1 %cmp24, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.029 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %a.028 = phi i32 [ %mul7, %for.body ], [ %3, %entry ]
+ %b.027 = phi i32 [ %mul6, %for.body ], [ %2, %entry ]
+ %g.026 = phi i32 [ %mul5, %for.body ], [ %1, %entry ]
+ %r.025 = phi i32 [ %mul, %for.body ], [ %0, %entry ]
+ %mul = mul nsw i32 %r.025, 18
+ %mul5 = mul nsw i32 %g.026, 19
+ %mul6 = mul nsw i32 %b.027, 12
+ %mul7 = mul nsw i32 %a.028, 9
+ %inc = add nsw i32 %i.029, 1
+ %cmp = icmp slt i32 %inc, %4
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ %a.0.lcssa = phi i32 [ %3, %entry ], [ %mul7, %for.body ]
+ %b.0.lcssa = phi i32 [ %2, %entry ], [ %mul6, %for.body ]
+ %g.0.lcssa = phi i32 [ %1, %entry ], [ %mul5, %for.body ]
+ %r.0.lcssa = phi i32 [ %0, %entry ], [ %mul, %for.body ]
+ store i32 %r.0.lcssa, i32* %A, align 4
+ store i32 %g.0.lcssa, i32* %arrayidx1, align 4
+ store i32 %b.0.lcssa, i32* %arrayidx2, align 4
+ store i32 %a.0.lcssa, i32* %arrayidx3, align 4
+ ret i32 undef
+}
+
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll b/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll
new file mode 100644
index 00000000000..0fe399e725d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/debug_info.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; int depth(double *A, int m) {
+; double y0 = 0; double y1 = 1;
+; for (int i=0; i < m; i++) {
+; y0 = A[4];
+; y1 = A[5];
+; }
+; A[8] = y0; A[8+1] = y1;
+; }
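+;
+; The loads of A[4] and A[5] are consecutive, so the CHECK lines expect a
+; single <2 x double> load and a single <2 x double> store, with the
+; llvm.dbg.value calls and their source locations preserved across
+; vectorization.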
+
+define i32 @depth(double* nocapture %A, i32 %m) #0 !dbg !4 {
+; CHECK-LABEL: @depth(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @llvm.dbg.value(metadata double* [[A:%.*]], metadata !12, metadata !DIExpression()), !dbg !18
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 [[M:%.*]], metadata !13, metadata !DIExpression()), !dbg !18
+; CHECK-NEXT: call void @llvm.dbg.value(metadata double 0.000000e+00, metadata !14, metadata !DIExpression()), !dbg !19
+; CHECK-NEXT: call void @llvm.dbg.value(metadata double 2.000000e-01, metadata !15, metadata !DIExpression()), !dbg !19
+; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 0, metadata !16, metadata !DIExpression()), !dbg !20
+; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[M]], 0, !dbg !20
+; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]], !dbg !20
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 4, !dbg !21
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*, !dbg !21
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8, !dbg !21
+; CHECK-NEXT: br label [[FOR_END]], !dbg !20
+; CHECK: for.end:
+; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[FOR_BODY_LR_PH]] ], [ <double 0.000000e+00, double 1.000000e+00>, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 8, !dbg !23
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*, !dbg !23
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8, !dbg !23
+; CHECK-NEXT: ret i32 undef, !dbg !24
+;
+entry:
+ tail call void @llvm.dbg.value(metadata double* %A, i64 0, metadata !12, metadata !DIExpression()), !dbg !19
+ tail call void @llvm.dbg.value(metadata i32 %m, i64 0, metadata !13, metadata !DIExpression()), !dbg !19
+ tail call void @llvm.dbg.value(metadata double 0.0, i64 0, metadata !14, metadata !DIExpression()), !dbg !21
+ tail call void @llvm.dbg.value(metadata double 0.2, i64 0, metadata !15, metadata !DIExpression()), !dbg !21
+ tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !16, metadata !DIExpression()), !dbg !23
+ %cmp8 = icmp sgt i32 %m, 0, !dbg !23
+ br i1 %cmp8, label %for.body.lr.ph, label %for.end, !dbg !23
+
+for.body.lr.ph: ; preds = %entry
+ %arrayidx = getelementptr inbounds double, double* %A, i64 4, !dbg !24
+ %0 = load double, double* %arrayidx, align 8, !dbg !24
+ %arrayidx1 = getelementptr inbounds double, double* %A, i64 5, !dbg !29
+ %1 = load double, double* %arrayidx1, align 8, !dbg !29
+ br label %for.end, !dbg !23
+
+for.end: ; preds = %for.body.lr.ph, %entry
+ %y1.0.lcssa = phi double [ %1, %for.body.lr.ph ], [ 1.000000e+00, %entry ]
+ %y0.0.lcssa = phi double [ %0, %for.body.lr.ph ], [ 0.000000e+00, %entry ]
+ %arrayidx2 = getelementptr inbounds double, double* %A, i64 8, !dbg !30
+ store double %y0.0.lcssa, double* %arrayidx2, align 8, !dbg !30
+ %arrayidx3 = getelementptr inbounds double, double* %A, i64 9, !dbg !30
+ store double %y1.0.lcssa, double* %arrayidx3, align 8, !dbg !30
+ ret i32 undef, !dbg !31
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!18, !32}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 187335) (llvm/trunk 187335:187340M)", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "file.c", directory: "/Users/nadav")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "depth", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, retainedNodes: !11)
+!5 = !DIFile(filename: "file.c", directory: "/Users/nadav")
+!6 = !DISubroutineType(types: !7)
+!7 = !{!8, !9, !8}
+!8 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !10)
+!10 = !DIBasicType(tag: DW_TAG_base_type, name: "double", size: 64, align: 64, encoding: DW_ATE_float)
+!11 = !{!12, !13, !14, !15, !16}
+!12 = !DILocalVariable(name: "A", line: 1, arg: 1, scope: !4, file: !5, type: !9)
+!13 = !DILocalVariable(name: "m", line: 1, arg: 2, scope: !4, file: !5, type: !8)
+!14 = !DILocalVariable(name: "y0", line: 2, scope: !4, file: !5, type: !10)
+!15 = !DILocalVariable(name: "y1", line: 2, scope: !4, file: !5, type: !10)
+!16 = !DILocalVariable(name: "i", line: 3, scope: !17, file: !5, type: !8)
+!17 = distinct !DILexicalBlock(line: 3, column: 0, file: !1, scope: !4)
+!18 = !{i32 2, !"Dwarf Version", i32 2}
+!19 = !DILocation(line: 1, scope: !4)
+!20 = !{double 0.000000e+00}
+!21 = !DILocation(line: 2, scope: !4)
+!22 = !{double 1.000000e+00}
+!23 = !DILocation(line: 3, scope: !17)
+!24 = !DILocation(line: 4, scope: !25)
+!25 = distinct !DILexicalBlock(line: 3, column: 0, file: !1, scope: !17)
+!29 = !DILocation(line: 5, scope: !25)
+!30 = !DILocation(line: 7, scope: !4)
+!31 = !DILocation(line: 8, scope: !4)
+!32 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
new file mode 100644
index 00000000000..3cba8ea00f7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll
@@ -0,0 +1,157 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; int foo(int * restrict B, int * restrict A, int n, int m) {
+; B[0] = n * A[0] + m * A[0];
+; B[1] = n * A[1] + m * A[1];
+; B[2] = n * A[2] + m * A[2];
+; B[3] = n * A[3] + m * A[3];
+; return 0;
+; }
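+;
+; In the IR below, n * A[i] + m * A[i] has already been folded to
+; A[i] * (n + m), so the CHECK lines expect one <4 x i32> load, a multiply
+; by a splat of (n + m), and one <4 x i32> store.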
+
+define i32 @foo(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[MUL238]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[MUL238]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[MUL238]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[MUL238]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %0 = load i32, i32* %A, align 4
+ %mul238 = add i32 %m, %n
+ %add = mul i32 %0, %mul238
+ store i32 %add, i32* %B, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 1
+ %1 = load i32, i32* %arrayidx4, align 4
+ %add8 = mul i32 %1, %mul238
+ %arrayidx9 = getelementptr inbounds i32, i32* %B, i64 1
+ store i32 %add8, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %A, i64 2
+ %2 = load i32, i32* %arrayidx10, align 4
+ %add14 = mul i32 %2, %mul238
+ %arrayidx15 = getelementptr inbounds i32, i32* %B, i64 2
+ store i32 %add14, i32* %arrayidx15, align 4
+ %arrayidx16 = getelementptr inbounds i32, i32* %A, i64 3
+ %3 = load i32, i32* %arrayidx16, align 4
+ %add20 = mul i32 %3, %mul238
+ %arrayidx21 = getelementptr inbounds i32, i32* %B, i64 3
+ store i32 %add20, i32* %arrayidx21, align 4
+ ret i32 0
+}
+
+
+; int extr_user(int * restrict B, int * restrict A, int n, int m) {
+; B[0] = n * A[0] + m * A[0];
+; B[1] = n * A[1] + m * A[1];
+; B[2] = n * A[2] + m * A[2];
+; B[3] = n * A[3] + m * A[3];
+; return A[0];
+; }
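+;
+; A[0] is also the return value here, so the CHECK lines expect an
+; extractelement of lane 0 from the vectorized load to feed the external
+; user in the return.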
+
+define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
+; CHECK-LABEL: @extr_user(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[MUL238]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[MUL238]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[MUL238]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[MUL238]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: ret i32 [[TMP8]]
+;
+entry:
+ %0 = load i32, i32* %A, align 4
+ %mul238 = add i32 %m, %n
+ %add = mul i32 %0, %mul238
+ store i32 %add, i32* %B, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 1
+ %1 = load i32, i32* %arrayidx4, align 4
+ %add8 = mul i32 %1, %mul238
+ %arrayidx9 = getelementptr inbounds i32, i32* %B, i64 1
+ store i32 %add8, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %A, i64 2
+ %2 = load i32, i32* %arrayidx10, align 4
+ %add14 = mul i32 %2, %mul238
+ %arrayidx15 = getelementptr inbounds i32, i32* %B, i64 2
+ store i32 %add14, i32* %arrayidx15, align 4
+ %arrayidx16 = getelementptr inbounds i32, i32* %A, i64 3
+ %3 = load i32, i32* %arrayidx16, align 4
+ %add20 = mul i32 %3, %mul238
+ %arrayidx21 = getelementptr inbounds i32, i32* %B, i64 3
+ store i32 %add20, i32* %arrayidx21, align 4
+ ret i32 %0 ;<--------- This value has multiple users
+}
+
+; In this example we have an external user that is not the first element in the vector.
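+; The CHECK lines therefore expect an extractelement of lane 1, rather than
+; lane 0, from the vectorized load.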
+define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
+; CHECK-LABEL: @extr_user1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[MUL238]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[MUL238]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[MUL238]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[MUL238]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: ret i32 [[TMP8]]
+;
+entry:
+ %0 = load i32, i32* %A, align 4
+ %mul238 = add i32 %m, %n
+ %add = mul i32 %0, %mul238
+ store i32 %add, i32* %B, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 1
+ %1 = load i32, i32* %arrayidx4, align 4
+ %add8 = mul i32 %1, %mul238
+ %arrayidx9 = getelementptr inbounds i32, i32* %B, i64 1
+ store i32 %add8, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %A, i64 2
+ %2 = load i32, i32* %arrayidx10, align 4
+ %add14 = mul i32 %2, %mul238
+ %arrayidx15 = getelementptr inbounds i32, i32* %B, i64 2
+ store i32 %add14, i32* %arrayidx15, align 4
+ %arrayidx16 = getelementptr inbounds i32, i32* %A, i64 3
+ %3 = load i32, i32* %arrayidx16, align 4
+ %add20 = mul i32 %3, %mul238
+ %arrayidx21 = getelementptr inbounds i32, i32* %B, i64 3
+ store i32 %add20, i32* %arrayidx21, align 4
+ ret i32 %1 ;<--------- This value has multiple users
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll
new file mode 100644
index 00000000000..1e47f7a51fd
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; double foo(double * restrict b, double * restrict a, int n, int m) {
+; double r=a[1];
+; double g=a[0];
+; double x;
+; for (int i=0; i < 100; i++) {
+; r += 10;
+; g += 10;
+; r *= 4;
+; g *= 4;
+; x = g; <----- external user!
+; r += 4;
+; g += 4;
+; }
+; b[0] = g;
+; b[1] = r;
+;
+; return x; <-- must extract here!
+; }
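+;
+; Because g escapes the loop through x, the CHECK lines expect an
+; extractelement of lane 0 from the last fmul result, in addition to the
+; vectorized store of the final values into b[0] and b[1].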
+
+define double @ext_user(double* noalias nocapture %B, double* noalias nocapture %A, i32 %n, i32 %m) {
+; CHECK-LABEL: @ext_user(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_020:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_020]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: ret double [[TMP7]]
+;
+entry:
+ %arrayidx = getelementptr inbounds double, double* %A, i64 1
+ %0 = load double, double* %arrayidx, align 8
+ %1 = load double, double* %A, align 8
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.020 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %G.019 = phi double [ %1, %entry ], [ %add5, %for.body ]
+ %R.018 = phi double [ %0, %entry ], [ %add4, %for.body ]
+ %add = fadd double %R.018, 1.000000e+01
+ %add2 = fadd double %G.019, 1.000000e+01
+ %mul = fmul double %add, 4.000000e+00
+ %mul3 = fmul double %add2, 4.000000e+00
+ %add4 = fadd double %mul, 4.000000e+00
+ %add5 = fadd double %mul3, 4.000000e+00
+ %inc = add nsw i32 %i.020, 1
+ %exitcond = icmp eq i32 %inc, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ store double %add5, double* %B, align 8
+ %arrayidx7 = getelementptr inbounds double, double* %B, i64 1
+ store double %add4, double* %arrayidx7, align 8
+ ret double %mul3
+}
+
+; A need-to-gather entry cannot be an external use of the scalar element.
+; Instead, the insertelement instructions of the need-to-gather entry are the
+; external users.
+; This test would assert because we would keep the scalar fpext and fadd alive.
+; PR18129
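+; The CHECK lines below therefore expect the whole computation to remain
+; scalar.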
+
+define i32 @needtogather(double *noalias %a, i32 *noalias %b, float * noalias %c,
+; CHECK-LABEL: @needtogather(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[D:%.*]], align 4
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[C:%.*]]
+; CHECK-NEXT: [[SUB:%.*]] = fsub float 0.000000e+00, [[TMP1]]
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[SUB]], 0.000000e+00
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[CONV]], [[MUL]]
+; CHECK-NEXT: [[CONV1:%.*]] = fpext float [[ADD]] to double
+; CHECK-NEXT: [[SUB3:%.*]] = fsub float 1.000000e+00, [[TMP1]]
+; CHECK-NEXT: [[MUL4:%.*]] = fmul float [[SUB3]], 0.000000e+00
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[CONV]], [[MUL4]]
+; CHECK-NEXT: [[CONV6:%.*]] = fpext float [[ADD5]] to double
+; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[ADD]], 0.000000e+00
+; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[STOREMERGE:%.*]] = phi double [ [[CONV6]], [[IF_THEN]] ], [ [[CONV1]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[E_0:%.*]] = phi double [ [[CONV1]], [[IF_THEN]] ], [ [[CONV6]], [[ENTRY]] ]
+; CHECK-NEXT: store double [[STOREMERGE]], double* [[A:%.*]], align 8
+; CHECK-NEXT: [[CONV7:%.*]] = fptosi double [[E_0]] to i32
+; CHECK-NEXT: store i32 [[CONV7]], i32* [[B:%.*]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+ i32 * noalias %d) {
+entry:
+ %0 = load i32, i32* %d, align 4
+ %conv = sitofp i32 %0 to float
+ %1 = load float, float* %c
+ %sub = fsub float 0.000000e+00, %1
+ %mul = fmul float %sub, 0.000000e+00
+ %add = fadd float %conv, %mul
+ %conv1 = fpext float %add to double
+ %sub3 = fsub float 1.000000e+00, %1
+ %mul4 = fmul float %sub3, 0.000000e+00
+ %add5 = fadd float %conv, %mul4
+ %conv6 = fpext float %add5 to double
+ %tobool = fcmp une float %add, 0.000000e+00
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+ br label %if.end
+
+if.end:
+ %storemerge = phi double [ %conv6, %if.then ], [ %conv1, %entry ]
+ %e.0 = phi double [ %conv1, %if.then ], [ %conv6, %entry ]
+ store double %storemerge, double* %a, align 8
+ %conv7 = fptosi double %e.0 to i32
+ store i32 %conv7, i32* %b, align 4
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
new file mode 100644
index 00000000000..a777b98852f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+@array = external global [20 x [13 x i32]]
+
+define void @hoge(i64 %idx, <4 x i32>* %sink) {
+; CHECK-LABEL: @hoge(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX:%.*]], i64 5
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 6
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 7
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 8
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 2
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 3
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3
+; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[SINK:%.*]]
+; CHECK-NEXT: ret void
+;
+bb:
+ %0 = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 %idx, i64 5
+ %1 = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 %idx, i64 6
+ %2 = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 %idx, i64 7
+ %3 = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 %idx, i64 8
+ %4 = load i32, i32* %1, align 4
+ %5 = insertelement <4 x i32> undef, i32 %4, i32 0
+ %6 = load i32, i32* %2, align 4
+ %7 = insertelement <4 x i32> %5, i32 %6, i32 1
+ %8 = load i32, i32* %3, align 4
+ %9 = insertelement <4 x i32> %7, i32 %8, i32 2
+ %10 = load i32, i32* %0, align 4
+ %11 = insertelement <4 x i32> %9, i32 %10, i32 3
+ store <4 x i32> %11, <4 x i32>* %sink
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll
new file mode 100644
index 00000000000..efce8825558
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-shuffle.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=bdver2 -slp-schedule-budget=1 | FileCheck %s
+
+define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT: [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i8> undef, i8 [[X0X0]], i32 0
+; CHECK-NEXT: [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1
+; CHECK-NEXT: ret <2 x i8> [[INS2]]
+;
+ %x0 = extractelement <2 x i8> %x, i32 0
+ %y1 = extractelement <2 x i8> %y, i32 1
+ %x0x0 = mul i8 %x0, %x0
+ %y1y1 = mul i8 %y1, %y1
+ %ins1 = insertelement <2 x i8> undef, i8 %x0x0, i32 0
+ %ins2 = insertelement <2 x i8> %ins1, i8 %y1y1, i32 1
+ ret <2 x i8> %ins2
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll
new file mode 100644
index 00000000000..9a741cbb4cf
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+define void @fextr(double* %ptr) {
+; CHECK-LABEL: @fextr(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[LD:%.*]] = load <2 x double>, <2 x double>* undef
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[LD]], <double 0.000000e+00, double 1.100000e+00>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %LD = load <2 x double>, <2 x double>* undef
+ %V0 = extractelement <2 x double> %LD, i32 0
+ %V1 = extractelement <2 x double> %LD, i32 1
+ %P0 = getelementptr inbounds double, double* %ptr, i64 0
+ %P1 = getelementptr inbounds double, double* %ptr, i64 1
+ %A0 = fadd double %V0, 0.0
+ %A1 = fadd double %V1, 1.1
+ store double %A0, double* %P0, align 4
+ store double %A1, double* %P1, align 4
+ ret void
+}
+
+define void @fextr1(double* %ptr) {
+; CHECK-LABEL: @fextr1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[LD:%.*]] = load <2 x double>, <2 x double>* undef
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x double> [[LD]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[REORDER_SHUFFLE]], <double 3.400000e+00, double 1.200000e+00>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P1]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %LD = load <2 x double>, <2 x double>* undef
+ %V0 = extractelement <2 x double> %LD, i32 0
+ %V1 = extractelement <2 x double> %LD, i32 1
+ %P0 = getelementptr inbounds double, double* %ptr, i64 1 ; <--- incorrect order
+ %P1 = getelementptr inbounds double, double* %ptr, i64 0
+ %A0 = fadd double %V0, 1.2
+ %A1 = fadd double %V1, 3.4
+ store double %A0, double* %P0, align 4
+ store double %A1, double* %P1, align 4
+ ret void
+}
+
+define void @fextr2(double* %ptr) {
+; CHECK-LABEL: @fextr2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[LD:%.*]] = load <4 x double>, <4 x double>* undef
+; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0
+; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 5.500000e+00, double 6.600000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %LD = load <4 x double>, <4 x double>* undef
+  %V0 = extractelement <4 x double> %LD, i32 0 ; <--- source vector is wider than the vectorized bundle.
+ %V1 = extractelement <4 x double> %LD, i32 1
+ %P0 = getelementptr inbounds double, double* %ptr, i64 0
+ %P1 = getelementptr inbounds double, double* %ptr, i64 1
+ %A0 = fadd double %V0, 5.5
+ %A1 = fadd double %V1, 6.6
+ store double %A0, double* %P0, align 4
+ store double %A1, double* %P1, align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
new file mode 100644
index 00000000000..e7197f8b113
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.9.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+@a = common global i64* null, align 8
+
+; Function Attrs: nounwind ssp uwtable
+define i32 @fn1() {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> undef, i64* [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> <i64 11, i64 56>
+; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64>
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load i64*, i64** @a, align 8
+ %add.ptr = getelementptr inbounds i64, i64* %0, i64 11
+ %1 = ptrtoint i64* %add.ptr to i64
+ store i64 %1, i64* %add.ptr, align 8
+ %add.ptr1 = getelementptr inbounds i64, i64* %0, i64 56
+ %2 = ptrtoint i64* %add.ptr1 to i64
+ %arrayidx2 = getelementptr inbounds i64, i64* %0, i64 12
+ store i64 %2, i64* %arrayidx2, align 8
+ ret i32 undef
+}
+
+
+declare float @llvm.powi.f32(float, i32)
+define void @fn2(i32* %a, i32* %b, float* %c) {
+; CHECK-LABEL: @fn2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 1
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32(<4 x float> [[TMP5]], i32 [[TMP6]])
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load i32, i32* %a, align 4
+ %i1 = load i32, i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %fp1 = sitofp i32 %add1 to float
+ %call1 = tail call float @llvm.powi.f32(float %fp1,i32 %add1) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 1
+ %i2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i32 1
+ %i3 = load i32, i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %fp2 = sitofp i32 %add2 to float
+ %call2 = tail call float @llvm.powi.f32(float %fp2,i32 %add1) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i32 2
+ %i4 = load i32, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i32 2
+ %i5 = load i32, i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %fp3 = sitofp i32 %add3 to float
+ %call3 = tail call float @llvm.powi.f32(float %fp3,i32 %add1) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32, i32* %a, i32 3
+ %i6 = load i32, i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i32 3
+ %i7 = load i32, i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %fp4 = sitofp i32 %add4 to float
+ %call4 = tail call float @llvm.powi.f32(float %fp4,i32 %add1) nounwind readnone
+
+ store float %call1, float* %c, align 4
+ %arrayidx8 = getelementptr inbounds float, float* %c, i32 1
+ store float %call2, float* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds float, float* %c, i32 2
+ store float %call3, float* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds float, float* %c, i32 3
+ store float %call4, float* %arrayidx10, align 4
+ ret void
+
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
new file mode 100644
index 00000000000..834f5a089b5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractcost.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @foo(i32* nocapture %A, i32 %n, i32 %m) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[N]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[N]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
+; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 9, i32 9, i32 9, i32 9>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
+; CHECK-NEXT: [[EXTERNALUSE1:%.*]] = add nsw i32 [[TMP9]], [[M:%.*]]
+; CHECK-NEXT: [[EXTERNALUSE2:%.*]] = mul nsw i32 [[TMP9]], [[M]]
+; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[EXTERNALUSE1]], [[EXTERNALUSE2]]
+; CHECK-NEXT: ret i32 [[ADD10]]
+;
+entry:
+ %mul = mul nsw i32 %n, 5
+ %add = add nsw i32 %mul, 9
+ store i32 %add, i32* %A, align 4
+ %mul1 = mul nsw i32 %n, 9
+ %add2 = add nsw i32 %mul1, 9
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
+ store i32 %add2, i32* %arrayidx3, align 4
+ %mul4 = shl i32 %n, 3
+ %add5 = add nsw i32 %mul4, 9
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 2
+ store i32 %add5, i32* %arrayidx6, align 4
+ %mul7 = mul nsw i32 %n, 10
+ %add8 = add nsw i32 %mul7, 9
+ %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 3
+ store i32 %add8, i32* %arrayidx9, align 4
+ %externaluse1 = add nsw i32 %add, %m
+ %externaluse2 = mul nsw i32 %add, %m ; we should add the extract cost only once and the store will be vectorized
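+  ; Both external users read the same extracted value, so the extract cost
+  ; is counted once and the four stores are still vectorized.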
+ %add10 = add nsw i32 %externaluse1, %externaluse2
+ ret i32 %add10
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
new file mode 100644
index 00000000000..10675f3be8a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 -slp-threshold=-1 -slp-vectorize-hor-store | FileCheck %s --check-prefix=THRESH1
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 -slp-threshold=-2 -slp-vectorize-hor-store | FileCheck %s --check-prefix=THRESH2
+
+@a = global float 0.000000e+00, align 4
+
+define float @f(<2 x float> %x) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[X:%.*]], [[X]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret float [[ADD]]
+;
+ %x0 = extractelement <2 x float> %x, i32 0
+ %x1 = extractelement <2 x float> %x, i32 1
+ %x0x0 = fmul float %x0, %x0
+ %x1x1 = fmul float %x1, %x1
+ %add = fadd float %x0x0, %x1x1
+ ret float %add
+}
+
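+; %x0 is both multiplied inside the tree and returned, i.e. it has a use
+; outside the vectorizable tree, so the THRESH2 checks expect an
+; extractelement of lane 0 to survive alongside the vectorized fmul.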
+define float @f_used_out_of_tree(<2 x float> %x) {
+; THRESH2-LABEL: @f_used_out_of_tree(
+; THRESH2-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; THRESH2-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]]
+; THRESH2-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESH2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESH2-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
+; THRESH2-NEXT: store float [[ADD]], float* @a
+; THRESH2-NEXT: ret float [[TMP1]]
+;
+ %x0 = extractelement <2 x float> %x, i32 0
+ %x1 = extractelement <2 x float> %x, i32 1
+ %x0x0 = fmul float %x0, %x0
+ %x1x1 = fmul float %x1, %x1
+ %add = fadd float %x0x0, %x1x1
+ store float %add, float* @a
+ ret float %x0
+}
+
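+; %x1 feeds both multiplies (%x0 * %x1 and %x1 * %x1), so the THRESH1 checks
+; expect it to be splatted via two insertelements before the vector fmul.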
+define float @f_used_twice_in_tree(<2 x float> %x) {
+; THRESH1-LABEL: @f_used_twice_in_tree(
+; THRESH1-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
+; THRESH1-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
+; THRESH1-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
+; THRESH1-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[X]], [[TMP3]]
+; THRESH1-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; THRESH1-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; THRESH1-NEXT: [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]]
+; THRESH1-NEXT: ret float [[ADD]]
+;
+ %x0 = extractelement <2 x float> %x, i32 0
+ %x1 = extractelement <2 x float> %x, i32 1
+ %x0x0 = fmul float %x0, %x1
+ %x1x1 = fmul float %x1, %x1
+ %add = fadd float %x0x0, %x1x1
+ ret float %add
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll b/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll
new file mode 100644
index 00000000000..25394f48849
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Regression test from https://bugs.llvm.org/show_bug.cgi?id=39168
+; Based on code from `compiler-rt/lib/builtins/multc3.c`
+; On platforms where fp128 lowers to an integer type (soft-fp) we
+; shouldn't be calling isFAbsFree() on the legalized type.
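+;
+; A hedged sketch of the multc3.c-style source this was reduced from (the
+; function name, the INFINITY comparison and the helper are assumptions, and
+; it presumes a target where long double is fp128):
+;
+;   #include <math.h>
+;   extern void handle_infinite_operand(void); /* hypothetical helper */
+;   void mul_special_case(long double c, long double d) {
+;     if (fabsl(c) == INFINITY || fabsl(d) == INFINITY)
+;       handle_infinite_operand(); /* models the unreachable block below */
+;   }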
+
+; RUN: opt -slp-vectorizer -slp-threshold=-10 -S %s | FileCheck %s
+
+target triple = "i686-unknown-linux-gnu"
+
+define void @vectorize_fp128(fp128 %c, fp128 %d) #0 {
+; CHECK-LABEL: @vectorize_fp128(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x fp128> undef, fp128 [[C:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x fp128> [[TMP0]], fp128 [[D:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x fp128> @llvm.fabs.v2f128(<2 x fp128> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x fp128> [[TMP2]], <fp128 0xL00000000000000007FFF000000000000, fp128 0xL00000000000000007FFF000000000000>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: [[OR_COND39:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: br i1 [[OR_COND39]], label [[IF_THEN13:%.*]], label [[IF_END24:%.*]]
+; CHECK: if.then13:
+; CHECK-NEXT: unreachable
+; CHECK: if.end24:
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = tail call fp128 @llvm.fabs.f128(fp128 %c)
+ %cmpinf10 = fcmp oeq fp128 %0, 0xL00000000000000007FFF000000000000
+ %1 = tail call fp128 @llvm.fabs.f128(fp128 %d)
+ %cmpinf12 = fcmp oeq fp128 %1, 0xL00000000000000007FFF000000000000
+ %or.cond39 = or i1 %cmpinf10, %cmpinf12
+ br i1 %or.cond39, label %if.then13, label %if.end24
+
+if.then13: ; preds = %entry
+ unreachable
+
+if.end24: ; preds = %entry
+ ret void
+}
+
+declare fp128 @llvm.fabs.f128(fp128)
+
+attributes #0 = { "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll b/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll
new file mode 100644
index 00000000000..101fd50e3e5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll
@@ -0,0 +1,275 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x double] zeroinitializer, align 64
+@src32 = common global [16 x float] zeroinitializer, align 64
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+
+;
+; FABS
+;
+
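+; Each function below is the fully unrolled form of an elementwise loop; as a
+; sketch, @fabs_2f64 corresponds to (using the globals declared above):
+;
+;   #include <math.h>
+;   void fabs_2f64(void) {
+;     for (int i = 0; i != 2; ++i)
+;       dst64[i] = fabs(src64[i]);
+;   }
+;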
+define void @fabs_2f64() #0 {
+; CHECK-LABEL: @fabs_2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %fabs0 = call double @llvm.fabs.f64(double %a0)
+ %fabs1 = call double @llvm.fabs.f64(double %a1)
+ store double %fabs0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %fabs1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @fabs_4f64() #0 {
+; SSE-LABEL: @fabs_4f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fabs_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %fabs0 = call double @llvm.fabs.f64(double %a0)
+ %fabs1 = call double @llvm.fabs.f64(double %a1)
+ %fabs2 = call double @llvm.fabs.f64(double %a2)
+ %fabs3 = call double @llvm.fabs.f64(double %a3)
+ store double %fabs0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %fabs1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %fabs2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %fabs3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @fabs_8f64() #0 {
+; SSE-LABEL: @fabs_8f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]])
+; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP4]])
+; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @fabs_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]])
+; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP2]])
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @fabs_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.fabs.v8f64(<8 x double> [[TMP1]])
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 4
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 4
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 4
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 4
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 4
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 4
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 4
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 4
+ %fabs0 = call double @llvm.fabs.f64(double %a0)
+ %fabs1 = call double @llvm.fabs.f64(double %a1)
+ %fabs2 = call double @llvm.fabs.f64(double %a2)
+ %fabs3 = call double @llvm.fabs.f64(double %a3)
+ %fabs4 = call double @llvm.fabs.f64(double %a4)
+ %fabs5 = call double @llvm.fabs.f64(double %a5)
+ %fabs6 = call double @llvm.fabs.f64(double %a6)
+ %fabs7 = call double @llvm.fabs.f64(double %a7)
+ store double %fabs0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4
+ store double %fabs1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4
+ store double %fabs2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
+ store double %fabs3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
+ store double %fabs4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
+ store double %fabs5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
+ store double %fabs6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
+ store double %fabs7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @fabs_4f32() #0 {
+; CHECK-LABEL: @fabs_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %fabs0 = call float @llvm.fabs.f32(float %a0)
+ %fabs1 = call float @llvm.fabs.f32(float %a1)
+ %fabs2 = call float @llvm.fabs.f32(float %a2)
+ %fabs3 = call float @llvm.fabs.f32(float %a3)
+ store float %fabs0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fabs1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fabs2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fabs3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @fabs_8f32() #0 {
+; SSE-LABEL: @fabs_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fabs_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %fabs0 = call float @llvm.fabs.f32(float %a0)
+ %fabs1 = call float @llvm.fabs.f32(float %a1)
+ %fabs2 = call float @llvm.fabs.f32(float %a2)
+ %fabs3 = call float @llvm.fabs.f32(float %a3)
+ %fabs4 = call float @llvm.fabs.f32(float %a4)
+ %fabs5 = call float @llvm.fabs.f32(float %a5)
+ %fabs6 = call float @llvm.fabs.f32(float %a6)
+ %fabs7 = call float @llvm.fabs.f32(float %a7)
+ store float %fabs0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fabs1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fabs2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fabs3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %fabs4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %fabs5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %fabs6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %fabs7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @fabs_16f32() #0 {
+; SSE-LABEL: @fabs_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]])
+; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]])
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @fabs_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]])
+; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP2]])
+; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @fabs_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.fabs.v16f32(<16 x float> [[TMP1]])
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %a8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
+ %a9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
+ %a10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+ %a11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+ %a12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+ %a13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+ %a14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+ %a15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+ %fabs0 = call float @llvm.fabs.f32(float %a0 )
+ %fabs1 = call float @llvm.fabs.f32(float %a1 )
+ %fabs2 = call float @llvm.fabs.f32(float %a2 )
+ %fabs3 = call float @llvm.fabs.f32(float %a3 )
+ %fabs4 = call float @llvm.fabs.f32(float %a4 )
+ %fabs5 = call float @llvm.fabs.f32(float %a5 )
+ %fabs6 = call float @llvm.fabs.f32(float %a6 )
+ %fabs7 = call float @llvm.fabs.f32(float %a7 )
+ %fabs8 = call float @llvm.fabs.f32(float %a8 )
+ %fabs9 = call float @llvm.fabs.f32(float %a9 )
+ %fabs10 = call float @llvm.fabs.f32(float %a10)
+ %fabs11 = call float @llvm.fabs.f32(float %a11)
+ %fabs12 = call float @llvm.fabs.f32(float %a12)
+ %fabs13 = call float @llvm.fabs.f32(float %a13)
+ %fabs14 = call float @llvm.fabs.f32(float %a14)
+ %fabs15 = call float @llvm.fabs.f32(float %a15)
+ store float %fabs0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fabs1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fabs2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fabs3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %fabs4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %fabs5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %fabs6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %fabs7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ store float %fabs8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+ store float %fabs9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+ store float %fabs10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %fabs11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %fabs12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %fabs13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %fabs14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %fabs15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll b/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll
new file mode 100644
index 00000000000..b304fed3dc9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll
@@ -0,0 +1,343 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@srcA64 = common global [8 x double] zeroinitializer, align 64
+@srcB64 = common global [8 x double] zeroinitializer, align 64
+@srcC64 = common global [8 x double] zeroinitializer, align 64
+@srcA32 = common global [16 x float] zeroinitializer, align 64
+@srcB32 = common global [16 x float] zeroinitializer, align 64
+@srcC32 = common global [16 x float] zeroinitializer, align 64
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+declare float @llvm.copysign.f32(float, float)
+declare double @llvm.copysign.f64(double, double)
+
+;
+; FCOPYSIGN
+;
+
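+; As in fabs.ll, each function below unrolls an elementwise loop; as a sketch,
+; @fcopysign_2f64 corresponds to (using the globals declared above):
+;
+;   #include <math.h>
+;   void fcopysign_2f64(void) {
+;     for (int i = 0; i != 2; ++i)
+;       dst64[i] = copysign(srcA64[i], srcB64[i]);
+;   }
+;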
+define void @fcopysign_2f64() #0 {
+; CHECK-LABEL: @fcopysign_2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]])
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
+ %b0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
+ %b1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
+ %fcopysign0 = call double @llvm.copysign.f64(double %a0, double %b0)
+ %fcopysign1 = call double @llvm.copysign.f64(double %a1, double %b1)
+ store double %fcopysign0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %fcopysign1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @fcopysign_4f64() #0 {
+; SSE-LABEL: @fcopysign_4f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]])
+; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]])
+; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fcopysign_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; AVX-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8
+ %b0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
+ %b1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
+ %b2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8
+ %b3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8
+ %fcopysign0 = call double @llvm.copysign.f64(double %a0, double %b0)
+ %fcopysign1 = call double @llvm.copysign.f64(double %a1, double %b1)
+ %fcopysign2 = call double @llvm.copysign.f64(double %a2, double %b2)
+ %fcopysign3 = call double @llvm.copysign.f64(double %a3, double %b3)
+ store double %fcopysign0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %fcopysign1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %fcopysign2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %fcopysign3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @fcopysign_8f64() #0 {
+; SSE-LABEL: @fcopysign_8f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]])
+; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @fcopysign_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]])
+; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]])
+; AVX256-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; AVX256-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @fcopysign_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcA64 to <8 x double>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcB64 to <8 x double>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.copysign.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]])
+; AVX512-NEXT: store <8 x double> [[TMP3]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4
+ %b0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4
+ %b1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4
+ %b2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4
+ %b3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4
+ %b4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4
+ %b5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4
+ %b6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4
+ %b7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4
+ %fcopysign0 = call double @llvm.copysign.f64(double %a0, double %b0)
+ %fcopysign1 = call double @llvm.copysign.f64(double %a1, double %b1)
+ %fcopysign2 = call double @llvm.copysign.f64(double %a2, double %b2)
+ %fcopysign3 = call double @llvm.copysign.f64(double %a3, double %b3)
+ %fcopysign4 = call double @llvm.copysign.f64(double %a4, double %b4)
+ %fcopysign5 = call double @llvm.copysign.f64(double %a5, double %b5)
+ %fcopysign6 = call double @llvm.copysign.f64(double %a6, double %b6)
+ %fcopysign7 = call double @llvm.copysign.f64(double %a7, double %b7)
+ store double %fcopysign0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4
+ store double %fcopysign1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4
+ store double %fcopysign2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
+ store double %fcopysign3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
+ store double %fcopysign4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
+ store double %fcopysign5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
+ store double %fcopysign6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
+ store double %fcopysign7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @fcopysign_4f32() #0 {
+; CHECK-LABEL: @fcopysign_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]])
+; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+ %b0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+ %b1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+ %b2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+ %b3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+ %fcopysign0 = call float @llvm.copysign.f32(float %a0, float %b0)
+ %fcopysign1 = call float @llvm.copysign.f32(float %a1, float %b1)
+ %fcopysign2 = call float @llvm.copysign.f32(float %a2, float %b2)
+ %fcopysign3 = call float @llvm.copysign.f32(float %a3, float %b3)
+ store float %fcopysign0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fcopysign1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fcopysign2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fcopysign3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @fcopysign_8f32() #0 {
+; SSE-LABEL: @fcopysign_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]])
+; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]])
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fcopysign_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; AVX-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
+ %b0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+ %b1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+ %b2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+ %b3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+ %b4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
+ %b5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
+ %b6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
+ %b7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
+ %fcopysign0 = call float @llvm.copysign.f32(float %a0, float %b0)
+ %fcopysign1 = call float @llvm.copysign.f32(float %a1, float %b1)
+ %fcopysign2 = call float @llvm.copysign.f32(float %a2, float %b2)
+ %fcopysign3 = call float @llvm.copysign.f32(float %a3, float %b3)
+ %fcopysign4 = call float @llvm.copysign.f32(float %a4, float %b4)
+ %fcopysign5 = call float @llvm.copysign.f32(float %a5, float %b5)
+ %fcopysign6 = call float @llvm.copysign.f32(float %a6, float %b6)
+ %fcopysign7 = call float @llvm.copysign.f32(float %a7, float %b7)
+ store float %fcopysign0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fcopysign1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fcopysign2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fcopysign3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %fcopysign4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %fcopysign5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %fcopysign6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %fcopysign7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @fcopysign_16f32() #0 {
+; SSE-LABEL: @fcopysign_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]])
+; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]])
+; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]])
+; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]])
+; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @fcopysign_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]])
+; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]])
+; AVX256-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX256-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @fcopysign_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcA32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcB32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.copysign.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]])
+; AVX512-NEXT: store <16 x float> [[TMP3]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
+ %a8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4
+ %a9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4
+ %a10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4
+ %a11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4
+ %a12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4
+ %a13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4
+ %a14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4
+ %a15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4
+ %b0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+ %b1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+ %b2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+ %b3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+ %b4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
+ %b5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
+ %b6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
+ %b7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
+ %b8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4
+ %b9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4
+ %b10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 10), align 4
+ %b11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4
+ %b12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4
+ %b13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4
+ %b14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4
+ %b15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4
+ %fcopysign0 = call float @llvm.copysign.f32(float %a0 , float %b0 )
+ %fcopysign1 = call float @llvm.copysign.f32(float %a1 , float %b1 )
+ %fcopysign2 = call float @llvm.copysign.f32(float %a2 , float %b2 )
+ %fcopysign3 = call float @llvm.copysign.f32(float %a3 , float %b3 )
+ %fcopysign4 = call float @llvm.copysign.f32(float %a4 , float %b4 )
+ %fcopysign5 = call float @llvm.copysign.f32(float %a5 , float %b5 )
+ %fcopysign6 = call float @llvm.copysign.f32(float %a6 , float %b6 )
+ %fcopysign7 = call float @llvm.copysign.f32(float %a7 , float %b7 )
+ %fcopysign8 = call float @llvm.copysign.f32(float %a8 , float %b8 )
+ %fcopysign9 = call float @llvm.copysign.f32(float %a9 , float %b9 )
+ %fcopysign10 = call float @llvm.copysign.f32(float %a10, float %b10)
+ %fcopysign11 = call float @llvm.copysign.f32(float %a11, float %b11)
+ %fcopysign12 = call float @llvm.copysign.f32(float %a12, float %b12)
+ %fcopysign13 = call float @llvm.copysign.f32(float %a13, float %b13)
+ %fcopysign14 = call float @llvm.copysign.f32(float %a14, float %b14)
+ %fcopysign15 = call float @llvm.copysign.f32(float %a15, float %b15)
+ store float %fcopysign0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fcopysign1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fcopysign2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fcopysign3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %fcopysign4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %fcopysign5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %fcopysign6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %fcopysign7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ store float %fcopysign8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+ store float %fcopysign9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+ store float %fcopysign10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %fcopysign11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %fcopysign12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %fcopysign13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %fcopysign14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %fcopysign15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/flag.ll b/llvm/test/Transforms/SLPVectorizer/X86/flag.ll
new file mode 100644
index 00000000000..e695c02ef10
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/flag.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=1000 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Check that the -slp-threshold command line flag works: the prohibitively high threshold (1000) must keep this otherwise-vectorizable loop scalar (a sketch of the vector form follows this file).
+define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i64 %n) {
+; CHECK-LABEL: @rollable(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[TMP26:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[I_019]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP4]], 7
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 7
+; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP7]], 7
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 14
+; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP10]], 7
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 21
+; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP13]], 7
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 28
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP5]]
+; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP23]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP8]]
+; CHECK-NEXT: store i32 [[TMP19]], i32* [[TMP24]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP11]]
+; CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP25]], align 4
+; CHECK-NEXT: [[TMP26]] = add i64 [[I_019]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP26]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: ret i32 undef
+;
+ %1 = icmp eq i64 %n, 0
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
+ %2 = shl i64 %i.019, 2
+ %3 = getelementptr inbounds i32, i32* %in, i64 %2
+ %4 = load i32, i32* %3, align 4
+ %5 = or i64 %2, 1
+ %6 = getelementptr inbounds i32, i32* %in, i64 %5
+ %7 = load i32, i32* %6, align 4
+ %8 = or i64 %2, 2
+ %9 = getelementptr inbounds i32, i32* %in, i64 %8
+ %10 = load i32, i32* %9, align 4
+ %11 = or i64 %2, 3
+ %12 = getelementptr inbounds i32, i32* %in, i64 %11
+ %13 = load i32, i32* %12, align 4
+ %14 = mul i32 %4, 7
+ %15 = add i32 %14, 7
+ %16 = mul i32 %7, 7
+ %17 = add i32 %16, 14
+ %18 = mul i32 %10, 7
+ %19 = add i32 %18, 21
+ %20 = mul i32 %13, 7
+ %21 = add i32 %20, 28
+ %22 = getelementptr inbounds i32, i32* %out, i64 %2
+ store i32 %15, i32* %22, align 4
+ %23 = getelementptr inbounds i32, i32* %out, i64 %5
+ store i32 %17, i32* %23, align 4
+ %24 = getelementptr inbounds i32, i32* %out, i64 %8
+ store i32 %19, i32* %24, align 4
+ %25 = getelementptr inbounds i32, i32* %out, i64 %11
+ store i32 %21, i32* %25, align 4
+ %26 = add i64 %i.019, 1
+ %exitcond = icmp eq i64 %26, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
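For contrast, a minimal hand-written sketch (illustrative value names, not part of the test) of the <4 x i32> loop body the SLP vectorizer would be expected to emit for the rollable bundle above once the -slp-threshold gate is lifted:

  %p = getelementptr inbounds i32, i32* %in, i64 %2
  %vp = bitcast i32* %p to <4 x i32>*
  %v = load <4 x i32>, <4 x i32>* %vp, align 4            ; one wide load replaces the four scalar loads
  %m = mul <4 x i32> %v, <i32 7, i32 7, i32 7, i32 7>     ; multiply by a splat of 7
  %a = add <4 x i32> %m, <i32 7, i32 14, i32 21, i32 28>  ; per-lane addends 7, 14, 21, 28
  %q = getelementptr inbounds i32, i32* %out, i64 %2
  %vq = bitcast i32* %q to <4 x i32>*
  store <4 x i32> %a, <4 x i32>* %vq, align 4             ; one wide store replaces the four scalar stores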
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fma.ll b/llvm/test/Transforms/SLPVectorizer/X86/fma.ll
new file mode 100644
index 00000000000..95b6a71e545
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fma.ll
@@ -0,0 +1,563 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=NO-FMA
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=FMA --check-prefix=FMA256
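+; The first two runs target configurations without FMA units (generic x86_64 and
+; corei7-avx), so the cost model keeps the llvm.fma calls scalar (NO-FMA); bdver1
+; and core-avx2 supply 256-bit FMA (FMA256), and skylake-avx512 widens to 512 bits
+; (FMA512) unless the prefer-256-bit attribute caps the vector width at 256 bits.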
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@srcA64 = common global [8 x double] zeroinitializer, align 64
+@srcB64 = common global [8 x double] zeroinitializer, align 64
+@srcC64 = common global [8 x double] zeroinitializer, align 64
+@srcA32 = common global [16 x float] zeroinitializer, align 64
+@srcB32 = common global [16 x float] zeroinitializer, align 64
+@srcC32 = common global [16 x float] zeroinitializer, align 64
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+declare float @llvm.fma.f32(float, float, float)
+declare double @llvm.fma.f64(double, double, double)
+
+;
+; FMA
+;
+
+define void @fma_2f64() #0 {
+; NO-FMA-LABEL: @fma_2f64(
+; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
+; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
+; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
+; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
+; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8
+; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8
+; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]])
+; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]])
+; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; NO-FMA-NEXT: ret void
+;
+; FMA-LABEL: @fma_2f64(
+; FMA-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8
+; FMA-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8
+; FMA-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8
+; FMA-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
+; FMA-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; FMA-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
+ %b0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
+ %b1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
+ %c0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8
+ %c1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8
+ %fma0 = call double @llvm.fma.f64(double %a0, double %b0, double %c0)
+ %fma1 = call double @llvm.fma.f64(double %a1, double %b1, double %c1)
+ store double %fma0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %fma1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @fma_4f64() #0 {
+; NO-FMA-LABEL: @fma_4f64(
+; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
+; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
+; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8
+; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8
+; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
+; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
+; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8
+; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8
+; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8
+; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8
+; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 8
+; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 8
+; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]])
+; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]])
+; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]])
+; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]])
+; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; NO-FMA-NEXT: ret void
+;
+; FMA-LABEL: @fma_4f64(
+; FMA-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 8
+; FMA-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 8
+; FMA-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 8
+; FMA-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]])
+; FMA-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; FMA-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 8
+ %b0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 8
+ %b1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 8
+ %b2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 8
+ %b3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 8
+ %c0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 8
+ %c1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 8
+ %c2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 8
+ %c3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 8
+ %fma0 = call double @llvm.fma.f64(double %a0, double %b0, double %c0)
+ %fma1 = call double @llvm.fma.f64(double %a1, double %b1, double %c1)
+ %fma2 = call double @llvm.fma.f64(double %a2, double %b2, double %c2)
+ %fma3 = call double @llvm.fma.f64(double %a3, double %b3, double %c3)
+ store double %fma0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %fma1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %fma2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %fma3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @fma_8f64() #0 {
+; NO-FMA-LABEL: @fma_8f64(
+; NO-FMA-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[B0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[B1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[B2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[B3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[B4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[B5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[B6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[B7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[C0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[C1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[C2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[C3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[C4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[C5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[C6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[C7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[FMA0:%.*]] = call double @llvm.fma.f64(double [[A0]], double [[B0]], double [[C0]])
+; NO-FMA-NEXT: [[FMA1:%.*]] = call double @llvm.fma.f64(double [[A1]], double [[B1]], double [[C1]])
+; NO-FMA-NEXT: [[FMA2:%.*]] = call double @llvm.fma.f64(double [[A2]], double [[B2]], double [[C2]])
+; NO-FMA-NEXT: [[FMA3:%.*]] = call double @llvm.fma.f64(double [[A3]], double [[B3]], double [[C3]])
+; NO-FMA-NEXT: [[FMA4:%.*]] = call double @llvm.fma.f64(double [[A4]], double [[B4]], double [[C4]])
+; NO-FMA-NEXT: [[FMA5:%.*]] = call double @llvm.fma.f64(double [[A5]], double [[B5]], double [[C5]])
+; NO-FMA-NEXT: [[FMA6:%.*]] = call double @llvm.fma.f64(double [[A6]], double [[B6]], double [[C6]])
+; NO-FMA-NEXT: [[FMA7:%.*]] = call double @llvm.fma.f64(double [[A7]], double [[B7]], double [[C7]])
+; NO-FMA-NEXT: store double [[FMA0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4
+; NO-FMA-NEXT: store double [[FMA1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4
+; NO-FMA-NEXT: store double [[FMA2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
+; NO-FMA-NEXT: store double [[FMA3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
+; NO-FMA-NEXT: store double [[FMA4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
+; NO-FMA-NEXT: store double [[FMA5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
+; NO-FMA-NEXT: store double [[FMA6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
+; NO-FMA-NEXT: store double [[FMA7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
+; NO-FMA-NEXT: ret void
+;
+; FMA256-LABEL: @fma_8f64(
+; FMA256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4
+; FMA256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4
+; FMA256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4
+; FMA256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]])
+; FMA256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]])
+; FMA256-NEXT: store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; FMA256-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; FMA256-NEXT: ret void
+;
+; FMA512-LABEL: @fma_8f64(
+; FMA512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcA64 to <8 x double>*), align 4
+; FMA512-NEXT: [[TMP2:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcB64 to <8 x double>*), align 4
+; FMA512-NEXT: [[TMP3:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @srcC64 to <8 x double>*), align 4
+; FMA512-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.fma.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]], <8 x double> [[TMP3]])
+; FMA512-NEXT: store <8 x double> [[TMP4]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
+; FMA512-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 0), align 4
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 1), align 4
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2), align 4
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 3), align 4
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4), align 4
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 5), align 4
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6), align 4
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 7), align 4
+ %b0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 0), align 4
+ %b1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 1), align 4
+ %b2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2), align 4
+ %b3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 3), align 4
+ %b4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4), align 4
+ %b5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 5), align 4
+ %b6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6), align 4
+ %b7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 7), align 4
+ %c0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 0), align 4
+ %c1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 1), align 4
+ %c2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2), align 4
+ %c3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 3), align 4
+ %c4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4), align 4
+ %c5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 5), align 4
+ %c6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6), align 4
+ %c7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 7), align 4
+ %fma0 = call double @llvm.fma.f64(double %a0, double %b0, double %c0)
+ %fma1 = call double @llvm.fma.f64(double %a1, double %b1, double %c1)
+ %fma2 = call double @llvm.fma.f64(double %a2, double %b2, double %c2)
+ %fma3 = call double @llvm.fma.f64(double %a3, double %b3, double %c3)
+ %fma4 = call double @llvm.fma.f64(double %a4, double %b4, double %c4)
+ %fma5 = call double @llvm.fma.f64(double %a5, double %b5, double %c5)
+ %fma6 = call double @llvm.fma.f64(double %a6, double %b6, double %c6)
+ %fma7 = call double @llvm.fma.f64(double %a7, double %b7, double %c7)
+ store double %fma0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4
+ store double %fma1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4
+ store double %fma2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
+ store double %fma3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
+ store double %fma4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
+ store double %fma5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
+ store double %fma6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
+ store double %fma7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @fma_4f32() #0 {
+; NO-FMA-LABEL: @fma_4f32(
+; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]])
+; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]])
+; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]])
+; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]])
+; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: ret void
+;
+; FMA-LABEL: @fma_4f32(
+; FMA-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4
+; FMA-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4
+; FMA-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4
+; FMA-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]])
+; FMA-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; FMA-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+ %b0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+ %b1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+ %b2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+ %b3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+ %c0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
+ %c1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
+ %c2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
+ %c3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
+ %fma0 = call float @llvm.fma.f32(float %a0, float %b0, float %c0)
+ %fma1 = call float @llvm.fma.f32(float %a1, float %b1, float %c1)
+ %fma2 = call float @llvm.fma.f32(float %a2, float %b2, float %c2)
+ %fma3 = call float @llvm.fma.f32(float %a3, float %b3, float %c3)
+ store float %fma0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fma1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fma2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fma3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @fma_8f32() #0 {
+; NO-FMA-LABEL: @fma_8f32(
+; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]])
+; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]])
+; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]])
+; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]])
+; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]])
+; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]])
+; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]])
+; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]])
+; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; NO-FMA-NEXT: ret void
+;
+; FMA-LABEL: @fma_8f32(
+; FMA-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; FMA-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; FMA-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
+; FMA-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]])
+; FMA-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; FMA-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
+ %b0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+ %b1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+ %b2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+ %b3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+ %b4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
+ %b5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
+ %b6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
+ %b7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
+ %c0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
+ %c1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
+ %c2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
+ %c3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
+ %c4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4
+ %c5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4
+ %c6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4
+ %c7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4
+ %fma0 = call float @llvm.fma.f32(float %a0, float %b0, float %c0)
+ %fma1 = call float @llvm.fma.f32(float %a1, float %b1, float %c1)
+ %fma2 = call float @llvm.fma.f32(float %a2, float %b2, float %c2)
+ %fma3 = call float @llvm.fma.f32(float %a3, float %b3, float %c3)
+ %fma4 = call float @llvm.fma.f32(float %a4, float %b4, float %c4)
+ %fma5 = call float @llvm.fma.f32(float %a5, float %b5, float %c5)
+ %fma6 = call float @llvm.fma.f32(float %a6, float %b6, float %c6)
+ %fma7 = call float @llvm.fma.f32(float %a7, float %b7, float %c7)
+ store float %fma0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fma1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fma2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fma3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %fma4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %fma5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %fma6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %fma7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @fma_16f32() #0 {
+; NO-FMA-LABEL: @fma_16f32(
+; NO-FMA-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[A8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4
+; NO-FMA-NEXT: [[A9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4
+; NO-FMA-NEXT: [[A10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4
+; NO-FMA-NEXT: [[A11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4
+; NO-FMA-NEXT: [[A12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4
+; NO-FMA-NEXT: [[A13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4
+; NO-FMA-NEXT: [[A14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4
+; NO-FMA-NEXT: [[A15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4
+; NO-FMA-NEXT: [[B0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[B1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[B2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[B3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[B4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[B5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[B6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[B7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[B8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4
+; NO-FMA-NEXT: [[B9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4
+; NO-FMA-NEXT: [[B10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 10), align 4
+; NO-FMA-NEXT: [[B11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4
+; NO-FMA-NEXT: [[B12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4
+; NO-FMA-NEXT: [[B13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4
+; NO-FMA-NEXT: [[B14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4
+; NO-FMA-NEXT: [[B15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4
+; NO-FMA-NEXT: [[C0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: [[C1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: [[C2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: [[C3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: [[C4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4
+; NO-FMA-NEXT: [[C5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4
+; NO-FMA-NEXT: [[C6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4
+; NO-FMA-NEXT: [[C7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4
+; NO-FMA-NEXT: [[C8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8), align 4
+; NO-FMA-NEXT: [[C9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 9), align 4
+; NO-FMA-NEXT: [[C10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 10), align 4
+; NO-FMA-NEXT: [[C11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 11), align 4
+; NO-FMA-NEXT: [[C12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12), align 4
+; NO-FMA-NEXT: [[C13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 13), align 4
+; NO-FMA-NEXT: [[C14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 14), align 4
+; NO-FMA-NEXT: [[C15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 15), align 4
+; NO-FMA-NEXT: [[FMA0:%.*]] = call float @llvm.fma.f32(float [[A0]], float [[B0]], float [[C0]])
+; NO-FMA-NEXT: [[FMA1:%.*]] = call float @llvm.fma.f32(float [[A1]], float [[B1]], float [[C1]])
+; NO-FMA-NEXT: [[FMA2:%.*]] = call float @llvm.fma.f32(float [[A2]], float [[B2]], float [[C2]])
+; NO-FMA-NEXT: [[FMA3:%.*]] = call float @llvm.fma.f32(float [[A3]], float [[B3]], float [[C3]])
+; NO-FMA-NEXT: [[FMA4:%.*]] = call float @llvm.fma.f32(float [[A4]], float [[B4]], float [[C4]])
+; NO-FMA-NEXT: [[FMA5:%.*]] = call float @llvm.fma.f32(float [[A5]], float [[B5]], float [[C5]])
+; NO-FMA-NEXT: [[FMA6:%.*]] = call float @llvm.fma.f32(float [[A6]], float [[B6]], float [[C6]])
+; NO-FMA-NEXT: [[FMA7:%.*]] = call float @llvm.fma.f32(float [[A7]], float [[B7]], float [[C7]])
+; NO-FMA-NEXT: [[FMA8:%.*]] = call float @llvm.fma.f32(float [[A8]], float [[B8]], float [[C8]])
+; NO-FMA-NEXT: [[FMA9:%.*]] = call float @llvm.fma.f32(float [[A9]], float [[B9]], float [[C9]])
+; NO-FMA-NEXT: [[FMA10:%.*]] = call float @llvm.fma.f32(float [[A10]], float [[B10]], float [[C10]])
+; NO-FMA-NEXT: [[FMA11:%.*]] = call float @llvm.fma.f32(float [[A11]], float [[B11]], float [[C11]])
+; NO-FMA-NEXT: [[FMA12:%.*]] = call float @llvm.fma.f32(float [[A12]], float [[B12]], float [[C12]])
+; NO-FMA-NEXT: [[FMA13:%.*]] = call float @llvm.fma.f32(float [[A13]], float [[B13]], float [[C13]])
+; NO-FMA-NEXT: [[FMA14:%.*]] = call float @llvm.fma.f32(float [[A14]], float [[B14]], float [[C14]])
+; NO-FMA-NEXT: [[FMA15:%.*]] = call float @llvm.fma.f32(float [[A15]], float [[B15]], float [[C15]])
+; NO-FMA-NEXT: store float [[FMA0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; NO-FMA-NEXT: store float [[FMA1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; NO-FMA-NEXT: store float [[FMA2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; NO-FMA-NEXT: store float [[FMA3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; NO-FMA-NEXT: store float [[FMA4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; NO-FMA-NEXT: store float [[FMA5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; NO-FMA-NEXT: store float [[FMA6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; NO-FMA-NEXT: store float [[FMA7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; NO-FMA-NEXT: store float [[FMA8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+; NO-FMA-NEXT: store float [[FMA9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; NO-FMA-NEXT: store float [[FMA10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+; NO-FMA-NEXT: store float [[FMA11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; NO-FMA-NEXT: store float [[FMA12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+; NO-FMA-NEXT: store float [[FMA13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; NO-FMA-NEXT: store float [[FMA14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+; NO-FMA-NEXT: store float [[FMA15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; NO-FMA-NEXT: ret void
+;
+; FMA256-LABEL: @fma_16f32(
+; FMA256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4
+; FMA256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4
+; FMA256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4
+; FMA256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]])
+; FMA256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
+; FMA256-NEXT: store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; FMA256-NEXT: store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; FMA256-NEXT: ret void
+;
+; FMA512-LABEL: @fma_16f32(
+; FMA512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcA32 to <16 x float>*), align 4
+; FMA512-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcB32 to <16 x float>*), align 4
+; FMA512-NEXT: [[TMP3:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @srcC32 to <16 x float>*), align 4
+; FMA512-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.fma.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]], <16 x float> [[TMP3]])
+; FMA512-NEXT: store <16 x float> [[TMP4]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; FMA512-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 7), align 4
+ %a8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8), align 4
+ %a9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 9), align 4
+ %a10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 10), align 4
+ %a11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 11), align 4
+ %a12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12), align 4
+ %a13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 13), align 4
+ %a14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 14), align 4
+ %a15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 15), align 4
+ %b0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 0), align 4
+ %b1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 1), align 4
+ %b2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 2), align 4
+ %b3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 3), align 4
+ %b4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4), align 4
+ %b5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 5), align 4
+ %b6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 6), align 4
+ %b7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 7), align 4
+ %b8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8), align 4
+ %b9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 9), align 4
+ %b10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 10), align 4
+ %b11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 11), align 4
+ %b12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12), align 4
+ %b13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 13), align 4
+ %b14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 14), align 4
+ %b15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 15), align 4
+ %c0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 0), align 4
+ %c1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 1), align 4
+ %c2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 2), align 4
+ %c3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 3), align 4
+ %c4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4), align 4
+ %c5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 5), align 4
+ %c6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 6), align 4
+ %c7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 7), align 4
+ %c8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8), align 4
+ %c9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 9), align 4
+ %c10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 10), align 4
+ %c11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 11), align 4
+ %c12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12), align 4
+ %c13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 13), align 4
+ %c14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 14), align 4
+ %c15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 15), align 4
+ %fma0 = call float @llvm.fma.f32(float %a0 , float %b0 , float %c0 )
+ %fma1 = call float @llvm.fma.f32(float %a1 , float %b1 , float %c1 )
+ %fma2 = call float @llvm.fma.f32(float %a2 , float %b2 , float %c2 )
+ %fma3 = call float @llvm.fma.f32(float %a3 , float %b3 , float %c3 )
+ %fma4 = call float @llvm.fma.f32(float %a4 , float %b4 , float %c4 )
+ %fma5 = call float @llvm.fma.f32(float %a5 , float %b5 , float %c5 )
+ %fma6 = call float @llvm.fma.f32(float %a6 , float %b6 , float %c6 )
+ %fma7 = call float @llvm.fma.f32(float %a7 , float %b7 , float %c7 )
+ %fma8 = call float @llvm.fma.f32(float %a8 , float %b8 , float %c8 )
+ %fma9 = call float @llvm.fma.f32(float %a9 , float %b9 , float %c9 )
+ %fma10 = call float @llvm.fma.f32(float %a10, float %b10, float %c10)
+ %fma11 = call float @llvm.fma.f32(float %a11, float %b11, float %c11)
+ %fma12 = call float @llvm.fma.f32(float %a12, float %b12, float %c12)
+ %fma13 = call float @llvm.fma.f32(float %a13, float %b13, float %c13)
+ %fma14 = call float @llvm.fma.f32(float %a14, float %b14, float %c14)
+ %fma15 = call float @llvm.fma.f32(float %a15, float %b15, float %c15)
+ store float %fma0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %fma1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %fma2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %fma3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %fma4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %fma5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %fma6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %fma7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ store float %fma8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+ store float %fma9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+ store float %fma10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %fma11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %fma12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %fma13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %fma14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %fma15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
new file mode 100644
index 00000000000..69b9f32b733
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll
@@ -0,0 +1,581 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
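+
+; The RUN lines cover plain SSE, three AVX configurations that lack the
+; AVX512DQ extension (AVX256NODQ), 512-bit AVX512, and AVX512 restricted to
+; 256-bit vectors but with DQ available (AVX256DQ).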
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x double] zeroinitializer, align 64
+@src32 = common global [16 x float] zeroinitializer, align 64
+@dst64 = common global [8 x i64] zeroinitializer, align 64
+@dst32 = common global [16 x i32] zeroinitializer, align 64
+@dst16 = common global [32 x i16] zeroinitializer, align 64
+@dst8 = common global [64 x i8] zeroinitializer, align 64
+
+;
+; FPTOSI vXf64
+;
+
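+; fptosi f64 -> i64 needs the AVX512DQ packed conversions to be profitable:
+; SSE and AVX256NODQ stay scalar, AVX512 converts a single <8 x double>, and
+; AVX256DQ splits the conversion into two <4 x double> halves.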
+define void @fptosi_8f64_8i64() #0 {
+; SSE-LABEL: @fptosi_8f64_8i64(
+; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64
+; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64
+; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64
+; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64
+; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64
+; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64
+; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64
+; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64
+; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @fptosi_8f64_8i64(
+; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i64
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i64
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i64
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i64
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i64
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i64
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i64
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i64
+; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @fptosi_8f64_8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i64>
+; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f64_8i64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %cvt0 = fptosi double %a0 to i64
+ %cvt1 = fptosi double %a1 to i64
+ %cvt2 = fptosi double %a2 to i64
+ %cvt3 = fptosi double %a3 to i64
+ %cvt4 = fptosi double %a4 to i64
+ %cvt5 = fptosi double %a5 to i64
+ %cvt6 = fptosi double %a6 to i64
+ %cvt7 = fptosi double %a7 to i64
+ store i64 %cvt0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %cvt1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+ store i64 %cvt2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+ store i64 %cvt3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+ store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+ store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+ store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+ store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
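+; fptosi f64 -> i32 is profitable everywhere: SSE vectorizes as two
+; <4 x double> halves and all AVX configurations as one <8 x double> block.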
+define void @fptosi_8f64_8i32() #0 {
+; SSE-LABEL: @fptosi_8f64_8i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32>
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fptosi_8f64_8i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i32>
+; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %cvt0 = fptosi double %a0 to i32
+ %cvt1 = fptosi double %a1 to i32
+ %cvt2 = fptosi double %a2 to i32
+ %cvt3 = fptosi double %a3 to i32
+ %cvt4 = fptosi double %a4 to i32
+ %cvt5 = fptosi double %a5 to i32
+ %cvt6 = fptosi double %a6 to i32
+ %cvt7 = fptosi double %a7 to i32
+ store i32 %cvt0, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %cvt1, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %cvt2, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %cvt3, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+ store i32 %cvt4, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+ store i32 %cvt5, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+ store i32 %cvt6, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+ store i32 %cvt7, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
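+; For f64 -> i16, SSE keeps the scalar form while every AVX configuration
+; vectorizes the conversion straight to <8 x i16>.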
+define void @fptosi_8f64_8i16() #0 {
+; SSE-LABEL: @fptosi_8f64_8i16(
+; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i16
+; SSE-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i16
+; SSE-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i16
+; SSE-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i16
+; SSE-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i16
+; SSE-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i16
+; SSE-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i16
+; SSE-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i16
+; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fptosi_8f64_8i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i16>
+; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %cvt0 = fptosi double %a0 to i16
+ %cvt1 = fptosi double %a1 to i16
+ %cvt2 = fptosi double %a2 to i16
+ %cvt3 = fptosi double %a3 to i16
+ %cvt4 = fptosi double %a4 to i16
+ %cvt5 = fptosi double %a5 to i16
+ %cvt6 = fptosi double %a6 to i16
+ %cvt7 = fptosi double %a7 to i16
+ store i16 %cvt0, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+ store i16 %cvt1, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+ store i16 %cvt2, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+ store i16 %cvt3, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+ store i16 %cvt4, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+ store i16 %cvt5, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+ store i16 %cvt6, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+ store i16 %cvt7, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+ ret void
+}
+
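+; f64 -> i8 is left scalar on all tested configurations.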
+define void @fptosi_8f64_8i8() #0 {
+; CHECK-LABEL: @fptosi_8f64_8i8(
+; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i8
+; CHECK-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i8
+; CHECK-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i8
+; CHECK-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i8
+; CHECK-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i8
+; CHECK-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i8
+; CHECK-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i8
+; CHECK-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i8
+; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %cvt0 = fptosi double %a0 to i8
+ %cvt1 = fptosi double %a1 to i8
+ %cvt2 = fptosi double %a2 to i8
+ %cvt3 = fptosi double %a3 to i8
+ %cvt4 = fptosi double %a4 to i8
+ %cvt5 = fptosi double %a5 to i8
+ %cvt6 = fptosi double %a6 to i8
+ %cvt7 = fptosi double %a7 to i8
+ store i8 %cvt0, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+ store i8 %cvt1, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+ store i8 %cvt2, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+ store i8 %cvt3, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+ store i8 %cvt4, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+ store i8 %cvt5, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+ store i8 %cvt6, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+ store i8 %cvt7, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+ ret void
+}
+
+;
+; FPTOSI vXf32
+;
+
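+; f32 -> i64 mirrors the f64 case: scalar without AVX512DQ, a single
+; <8 x float> conversion on AVX512, and two <4 x float> halves on AVX256DQ.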
+define void @fptosi_8f32_8i64() #0 {
+; SSE-LABEL: @fptosi_8f32_8i64(
+; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64
+; SSE-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64
+; SSE-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64
+; SSE-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64
+; SSE-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64
+; SSE-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64
+; SSE-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64
+; SSE-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64
+; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @fptosi_8f32_8i64(
+; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i64
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i64
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i64
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i64
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i64
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i64
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i64
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i64
+; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @fptosi_8f32_8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i64>
+; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @fptosi_8f32_8i64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %cvt0 = fptosi float %a0 to i64
+ %cvt1 = fptosi float %a1 to i64
+ %cvt2 = fptosi float %a2 to i64
+ %cvt3 = fptosi float %a3 to i64
+ %cvt4 = fptosi float %a4 to i64
+ %cvt5 = fptosi float %a5 to i64
+ %cvt6 = fptosi float %a6 to i64
+ %cvt7 = fptosi float %a7 to i64
+ store i64 %cvt0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %cvt1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+ store i64 %cvt2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+ store i64 %cvt3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+ store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+ store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+ store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+ store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
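+; f32 -> i32 vectorizes on every configuration: two <4 x float> halves on
+; SSE, one <8 x float> block on AVX.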
+define void @fptosi_8f32_8i32() #0 {
+; SSE-LABEL: @fptosi_8f32_8i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32>
+; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fptosi_8f32_8i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i32>
+; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %cvt0 = fptosi float %a0 to i32
+ %cvt1 = fptosi float %a1 to i32
+ %cvt2 = fptosi float %a2 to i32
+ %cvt3 = fptosi float %a3 to i32
+ %cvt4 = fptosi float %a4 to i32
+ %cvt5 = fptosi float %a5 to i32
+ %cvt6 = fptosi float %a6 to i32
+ %cvt7 = fptosi float %a7 to i32
+ store i32 %cvt0, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %cvt1, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %cvt2, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %cvt3, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+ store i32 %cvt4, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+ store i32 %cvt5, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+ store i32 %cvt6, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+ store i32 %cvt7, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
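+; f32 -> i16 vectorizes to <8 x i16> on every configuration.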
+define void @fptosi_8f32_8i16() #0 {
+; CHECK-LABEL: @fptosi_8f32_8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i16>
+; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %cvt0 = fptosi float %a0 to i16
+ %cvt1 = fptosi float %a1 to i16
+ %cvt2 = fptosi float %a2 to i16
+ %cvt3 = fptosi float %a3 to i16
+ %cvt4 = fptosi float %a4 to i16
+ %cvt5 = fptosi float %a5 to i16
+ %cvt6 = fptosi float %a6 to i16
+ %cvt7 = fptosi float %a7 to i16
+ store i16 %cvt0, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+ store i16 %cvt1, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+ store i16 %cvt2, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+ store i16 %cvt3, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+ store i16 %cvt4, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+ store i16 %cvt5, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+ store i16 %cvt6, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+ store i16 %cvt7, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+ ret void
+}
+
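+; f32 -> i8 is left scalar on all tested configurations.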
+define void @fptosi_8f32_8i8() #0 {
+; CHECK-LABEL: @fptosi_8f32_8i8(
+; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i8
+; CHECK-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i8
+; CHECK-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i8
+; CHECK-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i8
+; CHECK-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i8
+; CHECK-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i8
+; CHECK-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i8
+; CHECK-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i8
+; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %cvt0 = fptosi float %a0 to i8
+ %cvt1 = fptosi float %a1 to i8
+ %cvt2 = fptosi float %a2 to i8
+ %cvt3 = fptosi float %a3 to i8
+ %cvt4 = fptosi float %a4 to i8
+ %cvt5 = fptosi float %a5 to i8
+ %cvt6 = fptosi float %a6 to i8
+ %cvt7 = fptosi float %a7 to i8
+ store i8 %cvt0, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+ store i8 %cvt1, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+ store i8 %cvt2, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+ store i8 %cvt3, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+ store i8 %cvt4, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+ store i8 %cvt5, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+ store i8 %cvt6, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+ store i8 %cvt7, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+ ret void
+}
+
+;
+; FPTOSI BUILDVECTOR
+;
+
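+; Buildvector patterns fed by scalar function arguments are currently left
+; as scalar converts plus insertelements on all configurations.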
+define <4 x i32> @fptosi_4xf64_4i32(double %a0, double %a1, double %a2, double %a3) #0 {
+; CHECK-LABEL: @fptosi_4xf64_4i32(
+; CHECK-NEXT: [[CVT0:%.*]] = fptosi double [[A0:%.*]] to i32
+; CHECK-NEXT: [[CVT1:%.*]] = fptosi double [[A1:%.*]] to i32
+; CHECK-NEXT: [[CVT2:%.*]] = fptosi double [[A2:%.*]] to i32
+; CHECK-NEXT: [[CVT3:%.*]] = fptosi double [[A3:%.*]] to i32
+; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0
+; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
+; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
+; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[RES3]]
+;
+ %cvt0 = fptosi double %a0 to i32
+ %cvt1 = fptosi double %a1 to i32
+ %cvt2 = fptosi double %a2 to i32
+ %cvt3 = fptosi double %a3 to i32
+ %res0 = insertelement <4 x i32> undef, i32 %cvt0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %cvt1, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 %cvt2, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 %cvt3, i32 3
+ ret <4 x i32> %res3
+}
+
+define <4 x i32> @fptosi_4xf32_4i32(float %a0, float %a1, float %a2, float %a3) #0 {
+; CHECK-LABEL: @fptosi_4xf32_4i32(
+; CHECK-NEXT: [[CVT0:%.*]] = fptosi float [[A0:%.*]] to i32
+; CHECK-NEXT: [[CVT1:%.*]] = fptosi float [[A1:%.*]] to i32
+; CHECK-NEXT: [[CVT2:%.*]] = fptosi float [[A2:%.*]] to i32
+; CHECK-NEXT: [[CVT3:%.*]] = fptosi float [[A3:%.*]] to i32
+; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x i32> undef, i32 [[CVT0]], i32 0
+; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x i32> [[RES0]], i32 [[CVT1]], i32 1
+; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x i32> [[RES1]], i32 [[CVT2]], i32 2
+; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x i32> [[RES2]], i32 [[CVT3]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[RES3]]
+;
+ %cvt0 = fptosi float %a0 to i32
+ %cvt1 = fptosi float %a1 to i32
+ %cvt2 = fptosi float %a2 to i32
+ %cvt3 = fptosi float %a3 to i32
+ %res0 = insertelement <4 x i32> undef, i32 %cvt0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %cvt1, i32 1
+ %res2 = insertelement <4 x i32> %res1, i32 %cvt2, i32 2
+ %res3 = insertelement <4 x i32> %res2, i32 %cvt3, i32 3
+ ret <4 x i32> %res3
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
new file mode 100644
index 00000000000..33d998ffbbc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll
@@ -0,0 +1,677 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
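+
+; Same RUN matrix as fptosi.ll, exercising the unsigned conversions.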
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x double] zeroinitializer, align 64
+@src32 = common global [16 x float] zeroinitializer, align 64
+@dst64 = common global [8 x i64] zeroinitializer, align 64
+@dst32 = common global [16 x i32] zeroinitializer, align 64
+@dst16 = common global [32 x i16] zeroinitializer, align 64
+@dst8 = common global [64 x i8] zeroinitializer, align 64
+
+;
+; FPTOUI vXf64
+;
+
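+; fptoui f64 -> i64 follows the signed case: scalar without AVX512DQ, one
+; <8 x double> conversion on AVX512, two <4 x double> halves on AVX256DQ.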
+define void @fptoui_8f64_8i64() #0 {
+; SSE-LABEL: @fptoui_8f64_8i64(
+; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i64
+; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i64
+; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i64
+; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i64
+; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i64
+; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64
+; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64
+; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64
+; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @fptoui_8f64_8i64(
+; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i64
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i64
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i64
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i64
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i64
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i64
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i64
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i64
+; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @fptoui_8f64_8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i64>
+; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f64_8i64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %cvt0 = fptoui double %a0 to i64
+ %cvt1 = fptoui double %a1 to i64
+ %cvt2 = fptoui double %a2 to i64
+ %cvt3 = fptoui double %a3 to i64
+ %cvt4 = fptoui double %a4 to i64
+ %cvt5 = fptoui double %a5 to i64
+ %cvt6 = fptoui double %a6 to i64
+ %cvt7 = fptoui double %a7 to i64
+ store i64 %cvt0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %cvt1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+ store i64 %cvt2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+ store i64 %cvt3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+ store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+ store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+ store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+ store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @fptoui_8f64_8i32() #0 {
+; SSE-LABEL: @fptoui_8f64_8i32(
+; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32
+; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32
+; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32
+; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32
+; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32
+; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32
+; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32
+; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32
+; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @fptoui_8f64_8i32(
+; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i32
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i32
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i32
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i32
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i32
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i32
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i32
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i32
+; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @fptoui_8f64_8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
+; AVX512-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f64_8i32(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i32>
+; AVX256DQ-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX256DQ-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %cvt0 = fptoui double %a0 to i32
+ %cvt1 = fptoui double %a1 to i32
+ %cvt2 = fptoui double %a2 to i32
+ %cvt3 = fptoui double %a3 to i32
+ %cvt4 = fptoui double %a4 to i32
+ %cvt5 = fptoui double %a5 to i32
+ %cvt6 = fptoui double %a6 to i32
+ %cvt7 = fptoui double %a7 to i32
+ store i32 %cvt0, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %cvt1, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %cvt2, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %cvt3, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+ store i32 %cvt4, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+ store i32 %cvt5, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+ store i32 %cvt6, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+ store i32 %cvt7, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @fptoui_8f64_8i16() #0 {
+; SSE-LABEL: @fptoui_8f64_8i16(
+; SSE-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16
+; SSE-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16
+; SSE-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16
+; SSE-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16
+; SSE-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16
+; SSE-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16
+; SSE-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16
+; SSE-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16
+; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @fptoui_8f64_8i16(
+; AVX256NODQ-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i16
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i16
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i16
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i16
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i16
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i16
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i16
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i16
+; AVX256NODQ-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+; AVX256NODQ-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+; AVX256NODQ-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+; AVX256NODQ-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+; AVX256NODQ-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+; AVX256NODQ-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+; AVX256NODQ-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+; AVX256NODQ-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @fptoui_8f64_8i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
+; AVX512-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f64_8i16(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i16>
+; AVX256DQ-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; AVX256DQ-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %cvt0 = fptoui double %a0 to i16
+ %cvt1 = fptoui double %a1 to i16
+ %cvt2 = fptoui double %a2 to i16
+ %cvt3 = fptoui double %a3 to i16
+ %cvt4 = fptoui double %a4 to i16
+ %cvt5 = fptoui double %a5 to i16
+ %cvt6 = fptoui double %a6 to i16
+ %cvt7 = fptoui double %a7 to i16
+ store i16 %cvt0, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+ store i16 %cvt1, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+ store i16 %cvt2, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+ store i16 %cvt3, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+ store i16 %cvt4, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+ store i16 %cvt5, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+ store i16 %cvt6, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+ store i16 %cvt7, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+ ret void
+}
+
+define void @fptoui_8f64_8i8() #0 {
+; CHECK-LABEL: @fptoui_8f64_8i8(
+; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; CHECK-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i8
+; CHECK-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i8
+; CHECK-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i8
+; CHECK-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i8
+; CHECK-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i8
+; CHECK-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i8
+; CHECK-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i8
+; CHECK-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i8
+; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %cvt0 = fptoui double %a0 to i8
+ %cvt1 = fptoui double %a1 to i8
+ %cvt2 = fptoui double %a2 to i8
+ %cvt3 = fptoui double %a3 to i8
+ %cvt4 = fptoui double %a4 to i8
+ %cvt5 = fptoui double %a5 to i8
+ %cvt6 = fptoui double %a6 to i8
+ %cvt7 = fptoui double %a7 to i8
+ store i8 %cvt0, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+ store i8 %cvt1, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+ store i8 %cvt2, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+ store i8 %cvt3, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+ store i8 %cvt4, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+ store i8 %cvt5, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+ store i8 %cvt6, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+ store i8 %cvt7, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+ ret void
+}
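+
+; Note: for the f64 -> i8 case every run configuration shares the CHECK lines
+; above; none of the tested subtargets vectorizes it, presumably because the
+; cost model rates a wide fptoui followed by the aggressive narrowing to i8 as
+; more expensive than eight scalar conversions.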
+
+;
+; FPTOUI vXf32
+;
+
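+; As in the f64 tests above, the f32 -> i64 conversions are expected to stay
+; scalar on SSE and on AVX targets without DQ: a packed unsigned 64-bit
+; conversion (vcvttps2uqq) only exists with AVX512DQ, so the AVX512 run
+; vectorizes the whole block while the 256-bit DQ run splits it into two
+; <4 x float> halves.
+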
+define void @fptoui_8f32_8i64() #0 {
+; SSE-LABEL: @fptoui_8f32_8i64(
+; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i64
+; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i64
+; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i64
+; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i64
+; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i64
+; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64
+; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64
+; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64
+; SSE-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @fptoui_8f32_8i64(
+; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i64
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i64
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i64
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i64
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i64
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i64
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i64
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i64
+; AVX256NODQ-NEXT: store i64 [[CVT0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: store i64 [[CVT7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @fptoui_8f32_8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i64>
+; AVX512-NEXT: store <8 x i64> [[TMP2]], <8 x i64>* bitcast ([8 x i64]* @dst64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f32_8i64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64>
+; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i64>
+; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8
+; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX256DQ-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %cvt0 = fptoui float %a0 to i64
+ %cvt1 = fptoui float %a1 to i64
+ %cvt2 = fptoui float %a2 to i64
+ %cvt3 = fptoui float %a3 to i64
+ %cvt4 = fptoui float %a4 to i64
+ %cvt5 = fptoui float %a5 to i64
+ %cvt6 = fptoui float %a6 to i64
+ %cvt7 = fptoui float %a7 to i64
+ store i64 %cvt0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 0), align 8
+ store i64 %cvt1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 1), align 8
+ store i64 %cvt2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 2), align 8
+ store i64 %cvt3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 3), align 8
+ store i64 %cvt4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4), align 8
+ store i64 %cvt5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 5), align 8
+ store i64 %cvt6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 6), align 8
+ store i64 %cvt7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @fptoui_8f32_8i32() #0 {
+; SSE-LABEL: @fptoui_8f32_8i32(
+; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32
+; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32
+; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32
+; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32
+; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32
+; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32
+; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32
+; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32
+; SSE-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+; SSE-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+; SSE-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+; SSE-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @fptoui_8f32_8i32(
+; AVX256NODQ-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i32
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i32
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i32
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i32
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i32
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i32
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i32
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i32
+; AVX256NODQ-NEXT: store i32 [[CVT0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+; AVX256NODQ-NEXT: store i32 [[CVT7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @fptoui_8f32_8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32>
+; AVX512-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @fptoui_8f32_8i32(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i32>
+; AVX256DQ-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @dst32 to <8 x i32>*), align 4
+; AVX256DQ-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %cvt0 = fptoui float %a0 to i32
+ %cvt1 = fptoui float %a1 to i32
+ %cvt2 = fptoui float %a2 to i32
+ %cvt3 = fptoui float %a3 to i32
+ %cvt4 = fptoui float %a4 to i32
+ %cvt5 = fptoui float %a5 to i32
+ %cvt6 = fptoui float %a6 to i32
+ %cvt7 = fptoui float %a7 to i32
+ store i32 %cvt0, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 0), align 4
+ store i32 %cvt1, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 1), align 4
+ store i32 %cvt2, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 2), align 4
+ store i32 %cvt3, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 3), align 4
+ store i32 %cvt4, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4), align 4
+ store i32 %cvt5, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 5), align 4
+ store i32 %cvt6, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 6), align 4
+ store i32 %cvt7, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
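+
+; f32 -> i32 needs only the AVX512F/VL vcvttps2udq forms rather than DQ, which
+; is presumably why both AVX512-based runs vectorize it eight wide while plain
+; AVX2 (AVX256NODQ), which has no packed unsigned conversion at all, stays
+; scalar.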
+
+define void @fptoui_8f32_8i16() #0 {
+; SSE-LABEL: @fptoui_8f32_8i16(
+; SSE-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i16
+; SSE-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i16
+; SSE-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i16
+; SSE-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i16
+; SSE-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i16
+; SSE-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i16
+; SSE-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i16
+; SSE-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i16
+; SSE-NEXT: store i16 [[CVT0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[CVT1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[CVT2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[CVT3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[CVT4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[CVT5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[CVT6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[CVT7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @fptoui_8f32_8i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i16>
+; AVX-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @dst16 to <8 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %cvt0 = fptoui float %a0 to i16
+ %cvt1 = fptoui float %a1 to i16
+ %cvt2 = fptoui float %a2 to i16
+ %cvt3 = fptoui float %a3 to i16
+ %cvt4 = fptoui float %a4 to i16
+ %cvt5 = fptoui float %a5 to i16
+ %cvt6 = fptoui float %a6 to i16
+ %cvt7 = fptoui float %a7 to i16
+ store i16 %cvt0, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 0), align 2
+ store i16 %cvt1, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 1), align 2
+ store i16 %cvt2, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 2), align 2
+ store i16 %cvt3, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 3), align 2
+ store i16 %cvt4, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 4), align 2
+ store i16 %cvt5, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 5), align 2
+ store i16 %cvt6, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 6), align 2
+ store i16 %cvt7, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @dst16, i32 0, i64 7), align 2
+ ret void
+}
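+
+; Once the result type narrows to i16, all AVX configurations share a single
+; AVX prefix: an unsigned i16 result fits within the range of the signed
+; cvttps2dq conversion, so no DQ support should be needed and the vector form
+; is cheap enough everywhere above SSE.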
+
+define void @fptoui_8f32_8i8() #0 {
+; CHECK-LABEL: @fptoui_8f32_8i8(
+; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; CHECK-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i8
+; CHECK-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i8
+; CHECK-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i8
+; CHECK-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i8
+; CHECK-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i8
+; CHECK-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i8
+; CHECK-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i8
+; CHECK-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i8
+; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %cvt0 = fptoui float %a0 to i8
+ %cvt1 = fptoui float %a1 to i8
+ %cvt2 = fptoui float %a2 to i8
+ %cvt3 = fptoui float %a3 to i8
+ %cvt4 = fptoui float %a4 to i8
+ %cvt5 = fptoui float %a5 to i8
+ %cvt6 = fptoui float %a6 to i8
+ %cvt7 = fptoui float %a7 to i8
+ store i8 %cvt0, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1
+ store i8 %cvt1, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1
+ store i8 %cvt2, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1
+ store i8 %cvt3, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1
+ store i8 %cvt4, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1
+ store i8 %cvt5, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1
+ store i8 %cvt6, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1
+ store i8 %cvt7, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fround.ll b/llvm/test/Transforms/SLPVectorizer/X86/fround.ll
new file mode 100644
index 00000000000..071632b2156
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fround.ll
@@ -0,0 +1,2159 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE41
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
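+; The two skylake-avx512 runs differ only in the prefer-256-bit attribute:
+; with it set, vectorization is expected to be capped at 256-bit registers,
+; which is why that run shares the AVX2 check prefix instead of AVX512.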
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x double] zeroinitializer, align 64
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@src32 = common global [16 x float] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+declare double @llvm.ceil.f64(double %p)
+declare double @llvm.floor.f64(double %p)
+declare double @llvm.nearbyint.f64(double %p)
+declare double @llvm.rint.f64(double %p)
+declare double @llvm.trunc.f64(double %p)
+
+declare float @llvm.ceil.f32(float %p)
+declare float @llvm.floor.f32(float %p)
+declare float @llvm.nearbyint.f32(float %p)
+declare float @llvm.rint.f32(float %p)
+declare float @llvm.trunc.f32(float %p)
+
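+; ceil/floor/nearbyint/rint/trunc all lower to the SSE4.1 roundpd/roundps
+; family (vroundpd/vroundps on AVX), so the SSE2 run should keep every call
+; scalar while the SSE41 and AVX runs vectorize at 128-bit and 256-bit widths
+; respectively.
+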
+define void @ceil_2f64() #0 {
+; SSE2-LABEL: @ceil_2f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
+; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
+; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @ceil_2f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @ceil_2f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
+; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ceil0 = call double @llvm.ceil.f64(double %ld0)
+ %ceil1 = call double @llvm.ceil.f64(double %ld1)
+ store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
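+
+; The same split is expected to repeat for the wider ceil cases and for the
+; other rounding intrinsics below: scalar on SSE2, <2 x double> pairs on
+; SSE41, and full-width vectors on AVX.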
+
+define void @ceil_4f64() #0 {
+; SSE2-LABEL: @ceil_4f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
+; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
+; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]])
+; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]])
+; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @ceil_4f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @ceil_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %ceil0 = call double @llvm.ceil.f64(double %ld0)
+ %ceil1 = call double @llvm.ceil.f64(double %ld1)
+ %ceil2 = call double @llvm.ceil.f64(double %ld2)
+ %ceil3 = call double @llvm.ceil.f64(double %ld3)
+ store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %ceil2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %ceil3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @ceil_8f64() #0 {
+; SSE2-LABEL: @ceil_8f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE2-NEXT: [[CEIL0:%.*]] = call double @llvm.ceil.f64(double [[LD0]])
+; SSE2-NEXT: [[CEIL1:%.*]] = call double @llvm.ceil.f64(double [[LD1]])
+; SSE2-NEXT: [[CEIL2:%.*]] = call double @llvm.ceil.f64(double [[LD2]])
+; SSE2-NEXT: [[CEIL3:%.*]] = call double @llvm.ceil.f64(double [[LD3]])
+; SSE2-NEXT: [[CEIL4:%.*]] = call double @llvm.ceil.f64(double [[LD4]])
+; SSE2-NEXT: [[CEIL5:%.*]] = call double @llvm.ceil.f64(double [[LD5]])
+; SSE2-NEXT: [[CEIL6:%.*]] = call double @llvm.ceil.f64(double [[LD6]])
+; SSE2-NEXT: [[CEIL7:%.*]] = call double @llvm.ceil.f64(double [[LD7]])
+; SSE2-NEXT: store double [[CEIL0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[CEIL1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[CEIL2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[CEIL3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: store double [[CEIL4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+; SSE2-NEXT: store double [[CEIL5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE2-NEXT: store double [[CEIL6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+; SSE2-NEXT: store double [[CEIL7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @ceil_8f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @ceil_8f64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ceil_8f64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @ceil_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[TMP1]])
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
+; AVX512-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %ceil0 = call double @llvm.ceil.f64(double %ld0)
+ %ceil1 = call double @llvm.ceil.f64(double %ld1)
+ %ceil2 = call double @llvm.ceil.f64(double %ld2)
+ %ceil3 = call double @llvm.ceil.f64(double %ld3)
+ %ceil4 = call double @llvm.ceil.f64(double %ld4)
+ %ceil5 = call double @llvm.ceil.f64(double %ld5)
+ %ceil6 = call double @llvm.ceil.f64(double %ld6)
+ %ceil7 = call double @llvm.ceil.f64(double %ld7)
+ store double %ceil0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %ceil1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %ceil2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %ceil3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %ceil4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+ store double %ceil5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %ceil6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+ store double %ceil7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
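+; floor: same pattern as ceil above; scalar on SSE2, <2 x double> ops on SSE41,
+; up to <4 x double> on AVX and <8 x double> on AVX512.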
+define void @floor_2f64() #0 {
+; SSE2-LABEL: @floor_2f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
+; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
+; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @floor_2f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @floor_2f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
+; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %floor0 = call double @llvm.floor.f64(double %ld0)
+ %floor1 = call double @llvm.floor.f64(double %ld1)
+ store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @floor_4f64() #0 {
+; SSE2-LABEL: @floor_4f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
+; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
+; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]])
+; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]])
+; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @floor_4f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @floor_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %floor0 = call double @llvm.floor.f64(double %ld0)
+ %floor1 = call double @llvm.floor.f64(double %ld1)
+ %floor2 = call double @llvm.floor.f64(double %ld2)
+ %floor3 = call double @llvm.floor.f64(double %ld3)
+ store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %floor2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %floor3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @floor_8f64() #0 {
+; SSE2-LABEL: @floor_8f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE2-NEXT: [[FLOOR0:%.*]] = call double @llvm.floor.f64(double [[LD0]])
+; SSE2-NEXT: [[FLOOR1:%.*]] = call double @llvm.floor.f64(double [[LD1]])
+; SSE2-NEXT: [[FLOOR2:%.*]] = call double @llvm.floor.f64(double [[LD2]])
+; SSE2-NEXT: [[FLOOR3:%.*]] = call double @llvm.floor.f64(double [[LD3]])
+; SSE2-NEXT: [[FLOOR4:%.*]] = call double @llvm.floor.f64(double [[LD4]])
+; SSE2-NEXT: [[FLOOR5:%.*]] = call double @llvm.floor.f64(double [[LD5]])
+; SSE2-NEXT: [[FLOOR6:%.*]] = call double @llvm.floor.f64(double [[LD6]])
+; SSE2-NEXT: [[FLOOR7:%.*]] = call double @llvm.floor.f64(double [[LD7]])
+; SSE2-NEXT: store double [[FLOOR0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[FLOOR1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[FLOOR2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[FLOOR3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: store double [[FLOOR4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+; SSE2-NEXT: store double [[FLOOR5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE2-NEXT: store double [[FLOOR6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+; SSE2-NEXT: store double [[FLOOR7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @floor_8f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @floor_8f64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @floor_8f64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @floor_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[TMP1]])
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
+; AVX512-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %floor0 = call double @llvm.floor.f64(double %ld0)
+ %floor1 = call double @llvm.floor.f64(double %ld1)
+ %floor2 = call double @llvm.floor.f64(double %ld2)
+ %floor3 = call double @llvm.floor.f64(double %ld3)
+ %floor4 = call double @llvm.floor.f64(double %ld4)
+ %floor5 = call double @llvm.floor.f64(double %ld5)
+ %floor6 = call double @llvm.floor.f64(double %ld6)
+ %floor7 = call double @llvm.floor.f64(double %ld7)
+ store double %floor0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %floor1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %floor2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %floor3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %floor4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+ store double %floor5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %floor6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+ store double %floor7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
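+; nearbyint: vectorized the same way as ceil/floor on each subtarget.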
+define void @nearbyint_2f64() #0 {
+; SSE2-LABEL: @nearbyint_2f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
+; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
+; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @nearbyint_2f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @nearbyint_2f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
+; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0)
+ %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1)
+ store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @nearbyint_4f64() #0 {
+; SSE2-LABEL: @nearbyint_4f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
+; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
+; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]])
+; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]])
+; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @nearbyint_4f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @nearbyint_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0)
+ %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1)
+ %nearbyint2 = call double @llvm.nearbyint.f64(double %ld2)
+ %nearbyint3 = call double @llvm.nearbyint.f64(double %ld3)
+ store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %nearbyint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %nearbyint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @nearbyint_8f64() #0 {
+; SSE2-LABEL: @nearbyint_8f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE2-NEXT: [[NEARBYINT0:%.*]] = call double @llvm.nearbyint.f64(double [[LD0]])
+; SSE2-NEXT: [[NEARBYINT1:%.*]] = call double @llvm.nearbyint.f64(double [[LD1]])
+; SSE2-NEXT: [[NEARBYINT2:%.*]] = call double @llvm.nearbyint.f64(double [[LD2]])
+; SSE2-NEXT: [[NEARBYINT3:%.*]] = call double @llvm.nearbyint.f64(double [[LD3]])
+; SSE2-NEXT: [[NEARBYINT4:%.*]] = call double @llvm.nearbyint.f64(double [[LD4]])
+; SSE2-NEXT: [[NEARBYINT5:%.*]] = call double @llvm.nearbyint.f64(double [[LD5]])
+; SSE2-NEXT: [[NEARBYINT6:%.*]] = call double @llvm.nearbyint.f64(double [[LD6]])
+; SSE2-NEXT: [[NEARBYINT7:%.*]] = call double @llvm.nearbyint.f64(double [[LD7]])
+; SSE2-NEXT: store double [[NEARBYINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[NEARBYINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[NEARBYINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[NEARBYINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: store double [[NEARBYINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+; SSE2-NEXT: store double [[NEARBYINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE2-NEXT: store double [[NEARBYINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+; SSE2-NEXT: store double [[NEARBYINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @nearbyint_8f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @nearbyint_8f64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @nearbyint_8f64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @nearbyint_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> [[TMP1]])
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
+; AVX512-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %nearbyint0 = call double @llvm.nearbyint.f64(double %ld0)
+ %nearbyint1 = call double @llvm.nearbyint.f64(double %ld1)
+ %nearbyint2 = call double @llvm.nearbyint.f64(double %ld2)
+ %nearbyint3 = call double @llvm.nearbyint.f64(double %ld3)
+ %nearbyint4 = call double @llvm.nearbyint.f64(double %ld4)
+ %nearbyint5 = call double @llvm.nearbyint.f64(double %ld5)
+ %nearbyint6 = call double @llvm.nearbyint.f64(double %ld6)
+ %nearbyint7 = call double @llvm.nearbyint.f64(double %ld7)
+ store double %nearbyint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %nearbyint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %nearbyint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %nearbyint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %nearbyint4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+ store double %nearbyint5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %nearbyint6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+ store double %nearbyint7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
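+; rint: scalar on SSE2; SSE41 uses <2 x double> ops, AVX up to <4 x double>,
+; AVX512 up to <8 x double>.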
+define void @rint_2f64() #0 {
+; SSE2-LABEL: @rint_2f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
+; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
+; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @rint_2f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @rint_2f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
+; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %rint0 = call double @llvm.rint.f64(double %ld0)
+ %rint1 = call double @llvm.rint.f64(double %ld1)
+ store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @rint_4f64() #0 {
+; SSE2-LABEL: @rint_4f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
+; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
+; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]])
+; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]])
+; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @rint_4f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @rint_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %rint0 = call double @llvm.rint.f64(double %ld0)
+ %rint1 = call double @llvm.rint.f64(double %ld1)
+ %rint2 = call double @llvm.rint.f64(double %ld2)
+ %rint3 = call double @llvm.rint.f64(double %ld3)
+ store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %rint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %rint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @rint_8f64() #0 {
+; SSE2-LABEL: @rint_8f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE2-NEXT: [[RINT0:%.*]] = call double @llvm.rint.f64(double [[LD0]])
+; SSE2-NEXT: [[RINT1:%.*]] = call double @llvm.rint.f64(double [[LD1]])
+; SSE2-NEXT: [[RINT2:%.*]] = call double @llvm.rint.f64(double [[LD2]])
+; SSE2-NEXT: [[RINT3:%.*]] = call double @llvm.rint.f64(double [[LD3]])
+; SSE2-NEXT: [[RINT4:%.*]] = call double @llvm.rint.f64(double [[LD4]])
+; SSE2-NEXT: [[RINT5:%.*]] = call double @llvm.rint.f64(double [[LD5]])
+; SSE2-NEXT: [[RINT6:%.*]] = call double @llvm.rint.f64(double [[LD6]])
+; SSE2-NEXT: [[RINT7:%.*]] = call double @llvm.rint.f64(double [[LD7]])
+; SSE2-NEXT: store double [[RINT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[RINT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[RINT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[RINT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: store double [[RINT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+; SSE2-NEXT: store double [[RINT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE2-NEXT: store double [[RINT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+; SSE2-NEXT: store double [[RINT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @rint_8f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @rint_8f64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @rint_8f64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @rint_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.rint.v8f64(<8 x double> [[TMP1]])
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
+; AVX512-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %rint0 = call double @llvm.rint.f64(double %ld0)
+ %rint1 = call double @llvm.rint.f64(double %ld1)
+ %rint2 = call double @llvm.rint.f64(double %ld2)
+ %rint3 = call double @llvm.rint.f64(double %ld3)
+ %rint4 = call double @llvm.rint.f64(double %ld4)
+ %rint5 = call double @llvm.rint.f64(double %ld5)
+ %rint6 = call double @llvm.rint.f64(double %ld6)
+ %rint7 = call double @llvm.rint.f64(double %ld7)
+ store double %rint0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %rint1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %rint2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %rint3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %rint4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+ store double %rint5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %rint6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+ store double %rint7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
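+; trunc: scalar on SSE2; SSE41 uses <2 x double> ops, AVX up to <4 x double>,
+; AVX512 up to <8 x double>.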
+define void @trunc_2f64() #0 {
+; SSE2-LABEL: @trunc_2f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]])
+; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]])
+; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @trunc_2f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @trunc_2f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])
+; AVX-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %trunc0 = call double @llvm.trunc.f64(double %ld0)
+ %trunc1 = call double @llvm.trunc.f64(double %ld1)
+ store double %trunc0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %trunc1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @trunc_4f64() #0 {
+; SSE2-LABEL: @trunc_4f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]])
+; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]])
+; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]])
+; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]])
+; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @trunc_4f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @trunc_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %trunc0 = call double @llvm.trunc.f64(double %ld0)
+ %trunc1 = call double @llvm.trunc.f64(double %ld1)
+ %trunc2 = call double @llvm.trunc.f64(double %ld2)
+ %trunc3 = call double @llvm.trunc.f64(double %ld3)
+ store double %trunc0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %trunc1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %trunc2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %trunc3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
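+; trunc_8f64: scalar on SSE2, four <2 x double> ops on SSE4.1, two
+; <4 x double> ops on AVX1/AVX2, and a single <8 x double> llvm.trunc.v8f64
+; on AVX512.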
+define void @trunc_8f64() #0 {
+; SSE2-LABEL: @trunc_8f64(
+; SSE2-NEXT: [[LD0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+; SSE2-NEXT: [[LD1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+; SSE2-NEXT: [[LD2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+; SSE2-NEXT: [[LD3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+; SSE2-NEXT: [[LD4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+; SSE2-NEXT: [[LD5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+; SSE2-NEXT: [[LD6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+; SSE2-NEXT: [[LD7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+; SSE2-NEXT: [[TRUNC0:%.*]] = call double @llvm.trunc.f64(double [[LD0]])
+; SSE2-NEXT: [[TRUNC1:%.*]] = call double @llvm.trunc.f64(double [[LD1]])
+; SSE2-NEXT: [[TRUNC2:%.*]] = call double @llvm.trunc.f64(double [[LD2]])
+; SSE2-NEXT: [[TRUNC3:%.*]] = call double @llvm.trunc.f64(double [[LD3]])
+; SSE2-NEXT: [[TRUNC4:%.*]] = call double @llvm.trunc.f64(double [[LD4]])
+; SSE2-NEXT: [[TRUNC5:%.*]] = call double @llvm.trunc.f64(double [[LD5]])
+; SSE2-NEXT: [[TRUNC6:%.*]] = call double @llvm.trunc.f64(double [[LD6]])
+; SSE2-NEXT: [[TRUNC7:%.*]] = call double @llvm.trunc.f64(double [[LD7]])
+; SSE2-NEXT: store double [[TRUNC0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+; SSE2-NEXT: store double [[TRUNC1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE2-NEXT: store double [[TRUNC2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+; SSE2-NEXT: store double [[TRUNC3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE2-NEXT: store double [[TRUNC4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+; SSE2-NEXT: store double [[TRUNC5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE2-NEXT: store double [[TRUNC6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+; SSE2-NEXT: store double [[TRUNC7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @trunc_8f64(
+; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]])
+; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8
+; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @trunc_8f64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]])
+; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @trunc_8f64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]])
+; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @trunc_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.trunc.v8f64(<8 x double> [[TMP1]])
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 8
+; AVX512-NEXT: ret void
+;
+ %ld0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %ld1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %ld2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %ld3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %ld4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8
+ %ld5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8
+ %ld6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8
+ %ld7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8
+ %trunc0 = call double @llvm.trunc.f64(double %ld0)
+ %trunc1 = call double @llvm.trunc.f64(double %ld1)
+ %trunc2 = call double @llvm.trunc.f64(double %ld2)
+ %trunc3 = call double @llvm.trunc.f64(double %ld3)
+ %trunc4 = call double @llvm.trunc.f64(double %ld4)
+ %trunc5 = call double @llvm.trunc.f64(double %ld5)
+ %trunc6 = call double @llvm.trunc.f64(double %ld6)
+ %trunc7 = call double @llvm.trunc.f64(double %ld7)
+ store double %trunc0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %trunc1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %trunc2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %trunc3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %trunc4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 8
+ store double %trunc5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %trunc6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 8
+ store double %trunc7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
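+; ceil_4f32: vectorized to a single llvm.ceil.v4f32 on SSE4.1 and AVX; SSE2
+; lacks roundps, so the four scalar llvm.ceil.f32 calls remain.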
+define void @ceil_4f32() #0 {
+; SSE2-LABEL: @ceil_4f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
+; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
+; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
+; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
+; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @ceil_4f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @ceil_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %ceil0 = call float @llvm.ceil.f32(float %ld0)
+ %ceil1 = call float @llvm.ceil.f32(float %ld1)
+ %ceil2 = call float @llvm.ceil.f32(float %ld2)
+ %ceil3 = call float @llvm.ceil.f32(float %ld3)
+ store float %ceil0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %ceil1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %ceil2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %ceil3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
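+; ceil_8f32: two <4 x float> llvm.ceil.v4f32 calls on SSE4.1, one
+; <8 x float> llvm.ceil.v8f32 on AVX.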
+define void @ceil_8f32() #0 {
+; SSE2-LABEL: @ceil_8f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
+; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
+; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
+; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
+; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]])
+; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]])
+; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]])
+; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]])
+; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @ceil_8f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @ceil_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %ceil0 = call float @llvm.ceil.f32(float %ld0)
+ %ceil1 = call float @llvm.ceil.f32(float %ld1)
+ %ceil2 = call float @llvm.ceil.f32(float %ld2)
+ %ceil3 = call float @llvm.ceil.f32(float %ld3)
+ %ceil4 = call float @llvm.ceil.f32(float %ld4)
+ %ceil5 = call float @llvm.ceil.f32(float %ld5)
+ %ceil6 = call float @llvm.ceil.f32(float %ld6)
+ %ceil7 = call float @llvm.ceil.f32(float %ld7)
+ store float %ceil0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %ceil1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %ceil2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %ceil3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %ceil4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %ceil5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %ceil6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %ceil7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
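+; ceil_16f32: four <4 x float> ops on SSE4.1, two <8 x float> ops on
+; AVX1/AVX2, and a single <16 x float> llvm.ceil.v16f32 on AVX512.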
+define void @ceil_16f32() #0 {
+; SSE2-LABEL: @ceil_16f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
+; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
+; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+; SSE2-NEXT: [[CEIL0:%.*]] = call float @llvm.ceil.f32(float [[LD0]])
+; SSE2-NEXT: [[CEIL1:%.*]] = call float @llvm.ceil.f32(float [[LD1]])
+; SSE2-NEXT: [[CEIL2:%.*]] = call float @llvm.ceil.f32(float [[LD2]])
+; SSE2-NEXT: [[CEIL3:%.*]] = call float @llvm.ceil.f32(float [[LD3]])
+; SSE2-NEXT: [[CEIL4:%.*]] = call float @llvm.ceil.f32(float [[LD4]])
+; SSE2-NEXT: [[CEIL5:%.*]] = call float @llvm.ceil.f32(float [[LD5]])
+; SSE2-NEXT: [[CEIL6:%.*]] = call float @llvm.ceil.f32(float [[LD6]])
+; SSE2-NEXT: [[CEIL7:%.*]] = call float @llvm.ceil.f32(float [[LD7]])
+; SSE2-NEXT: [[CEIL8:%.*]] = call float @llvm.ceil.f32(float [[LD8]])
+; SSE2-NEXT: [[CEIL9:%.*]] = call float @llvm.ceil.f32(float [[LD9]])
+; SSE2-NEXT: [[CEIL10:%.*]] = call float @llvm.ceil.f32(float [[LD10]])
+; SSE2-NEXT: [[CEIL11:%.*]] = call float @llvm.ceil.f32(float [[LD11]])
+; SSE2-NEXT: [[CEIL12:%.*]] = call float @llvm.ceil.f32(float [[LD12]])
+; SSE2-NEXT: [[CEIL13:%.*]] = call float @llvm.ceil.f32(float [[LD13]])
+; SSE2-NEXT: [[CEIL14:%.*]] = call float @llvm.ceil.f32(float [[LD14]])
+; SSE2-NEXT: [[CEIL15:%.*]] = call float @llvm.ceil.f32(float [[LD15]])
+; SSE2-NEXT: store float [[CEIL0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[CEIL1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[CEIL2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[CEIL3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[CEIL4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[CEIL5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[CEIL6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[CEIL7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: store float [[CEIL8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+; SSE2-NEXT: store float [[CEIL9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE2-NEXT: store float [[CEIL10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+; SSE2-NEXT: store float [[CEIL11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE2-NEXT: store float [[CEIL12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+; SSE2-NEXT: store float [[CEIL13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE2-NEXT: store float [[CEIL14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+; SSE2-NEXT: store float [[CEIL15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @ceil_16f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @ceil_16f32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ceil_16f32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @ceil_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[TMP1]])
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4
+ %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4
+ %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4
+ %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+ %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+ %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+ %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+ %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+ %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+ %ceil0 = call float @llvm.ceil.f32(float %ld0 )
+ %ceil1 = call float @llvm.ceil.f32(float %ld1 )
+ %ceil2 = call float @llvm.ceil.f32(float %ld2 )
+ %ceil3 = call float @llvm.ceil.f32(float %ld3 )
+ %ceil4 = call float @llvm.ceil.f32(float %ld4 )
+ %ceil5 = call float @llvm.ceil.f32(float %ld5 )
+ %ceil6 = call float @llvm.ceil.f32(float %ld6 )
+ %ceil7 = call float @llvm.ceil.f32(float %ld7 )
+ %ceil8 = call float @llvm.ceil.f32(float %ld8 )
+ %ceil9 = call float @llvm.ceil.f32(float %ld9 )
+ %ceil10 = call float @llvm.ceil.f32(float %ld10)
+ %ceil11 = call float @llvm.ceil.f32(float %ld11)
+ %ceil12 = call float @llvm.ceil.f32(float %ld12)
+ %ceil13 = call float @llvm.ceil.f32(float %ld13)
+ %ceil14 = call float @llvm.ceil.f32(float %ld14)
+ %ceil15 = call float @llvm.ceil.f32(float %ld15)
+ store float %ceil0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4
+ store float %ceil1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %ceil2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4
+ store float %ceil3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %ceil4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
+ store float %ceil5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %ceil6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
+ store float %ceil7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %ceil8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
+ store float %ceil9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %ceil10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %ceil11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %ceil12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %ceil13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %ceil14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %ceil15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
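+; The floor tests below mirror the ceil tests above with llvm.floor.*.
+; floor_4f32: single llvm.floor.v4f32 on SSE4.1/AVX; scalar on SSE2.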
+define void @floor_4f32() #0 {
+; SSE2-LABEL: @floor_4f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
+; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
+; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
+; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
+; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @floor_4f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @floor_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %floor0 = call float @llvm.floor.f32(float %ld0)
+ %floor1 = call float @llvm.floor.f32(float %ld1)
+ %floor2 = call float @llvm.floor.f32(float %ld2)
+ %floor3 = call float @llvm.floor.f32(float %ld3)
+ store float %floor0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %floor1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %floor2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %floor3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
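+; floor_8f32: two <4 x float> llvm.floor.v4f32 calls on SSE4.1, one
+; <8 x float> llvm.floor.v8f32 on AVX.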
+define void @floor_8f32() #0 {
+; SSE2-LABEL: @floor_8f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
+; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
+; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
+; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
+; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]])
+; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]])
+; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]])
+; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]])
+; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @floor_8f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @floor_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %floor0 = call float @llvm.floor.f32(float %ld0)
+ %floor1 = call float @llvm.floor.f32(float %ld1)
+ %floor2 = call float @llvm.floor.f32(float %ld2)
+ %floor3 = call float @llvm.floor.f32(float %ld3)
+ %floor4 = call float @llvm.floor.f32(float %ld4)
+ %floor5 = call float @llvm.floor.f32(float %ld5)
+ %floor6 = call float @llvm.floor.f32(float %ld6)
+ %floor7 = call float @llvm.floor.f32(float %ld7)
+ store float %floor0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %floor1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %floor2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %floor3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %floor4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %floor5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %floor6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %floor7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
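+; floor_16f32: four <4 x float> ops on SSE4.1, two <8 x float> ops on
+; AVX1/AVX2, and a single <16 x float> llvm.floor.v16f32 on AVX512.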
+define void @floor_16f32() #0 {
+; SSE2-LABEL: @floor_16f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
+; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
+; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+; SSE2-NEXT: [[FLOOR0:%.*]] = call float @llvm.floor.f32(float [[LD0]])
+; SSE2-NEXT: [[FLOOR1:%.*]] = call float @llvm.floor.f32(float [[LD1]])
+; SSE2-NEXT: [[FLOOR2:%.*]] = call float @llvm.floor.f32(float [[LD2]])
+; SSE2-NEXT: [[FLOOR3:%.*]] = call float @llvm.floor.f32(float [[LD3]])
+; SSE2-NEXT: [[FLOOR4:%.*]] = call float @llvm.floor.f32(float [[LD4]])
+; SSE2-NEXT: [[FLOOR5:%.*]] = call float @llvm.floor.f32(float [[LD5]])
+; SSE2-NEXT: [[FLOOR6:%.*]] = call float @llvm.floor.f32(float [[LD6]])
+; SSE2-NEXT: [[FLOOR7:%.*]] = call float @llvm.floor.f32(float [[LD7]])
+; SSE2-NEXT: [[FLOOR8:%.*]] = call float @llvm.floor.f32(float [[LD8]])
+; SSE2-NEXT: [[FLOOR9:%.*]] = call float @llvm.floor.f32(float [[LD9]])
+; SSE2-NEXT: [[FLOOR10:%.*]] = call float @llvm.floor.f32(float [[LD10]])
+; SSE2-NEXT: [[FLOOR11:%.*]] = call float @llvm.floor.f32(float [[LD11]])
+; SSE2-NEXT: [[FLOOR12:%.*]] = call float @llvm.floor.f32(float [[LD12]])
+; SSE2-NEXT: [[FLOOR13:%.*]] = call float @llvm.floor.f32(float [[LD13]])
+; SSE2-NEXT: [[FLOOR14:%.*]] = call float @llvm.floor.f32(float [[LD14]])
+; SSE2-NEXT: [[FLOOR15:%.*]] = call float @llvm.floor.f32(float [[LD15]])
+; SSE2-NEXT: store float [[FLOOR0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[FLOOR1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[FLOOR2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[FLOOR3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[FLOOR4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[FLOOR5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[FLOOR6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[FLOOR7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: store float [[FLOOR8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+; SSE2-NEXT: store float [[FLOOR9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE2-NEXT: store float [[FLOOR10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+; SSE2-NEXT: store float [[FLOOR11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE2-NEXT: store float [[FLOOR12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+; SSE2-NEXT: store float [[FLOOR13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE2-NEXT: store float [[FLOOR14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+; SSE2-NEXT: store float [[FLOOR15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @floor_16f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @floor_16f32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @floor_16f32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @floor_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[TMP1]])
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4
+ %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4
+ %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4
+ %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+ %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+ %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+ %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+ %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+ %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+ %floor0 = call float @llvm.floor.f32(float %ld0 )
+ %floor1 = call float @llvm.floor.f32(float %ld1 )
+ %floor2 = call float @llvm.floor.f32(float %ld2 )
+ %floor3 = call float @llvm.floor.f32(float %ld3 )
+ %floor4 = call float @llvm.floor.f32(float %ld4 )
+ %floor5 = call float @llvm.floor.f32(float %ld5 )
+ %floor6 = call float @llvm.floor.f32(float %ld6 )
+ %floor7 = call float @llvm.floor.f32(float %ld7 )
+ %floor8 = call float @llvm.floor.f32(float %ld8 )
+ %floor9 = call float @llvm.floor.f32(float %ld9 )
+ %floor10 = call float @llvm.floor.f32(float %ld10)
+ %floor11 = call float @llvm.floor.f32(float %ld11)
+ %floor12 = call float @llvm.floor.f32(float %ld12)
+ %floor13 = call float @llvm.floor.f32(float %ld13)
+ %floor14 = call float @llvm.floor.f32(float %ld14)
+ %floor15 = call float @llvm.floor.f32(float %ld15)
+ store float %floor0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4
+ store float %floor1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %floor2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4
+ store float %floor3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %floor4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
+ store float %floor5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %floor6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
+ store float %floor7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %floor8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
+ store float %floor9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %floor10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %floor11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %floor12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %floor13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %floor14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %floor15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
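+; nearbyint_4f32: single llvm.nearbyint.v4f32 on SSE4.1/AVX; scalar
+; llvm.nearbyint.f32 calls on SSE2.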
+define void @nearbyint_4f32() #0 {
+; SSE2-LABEL: @nearbyint_4f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]])
+; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]])
+; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]])
+; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]])
+; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @nearbyint_4f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @nearbyint_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0)
+ %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1)
+ %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2)
+ %nearbyint3 = call float @llvm.nearbyint.f32(float %ld3)
+ store float %nearbyint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %nearbyint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %nearbyint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %nearbyint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @nearbyint_8f32() #0 {
+; SSE2-LABEL: @nearbyint_8f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]])
+; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]])
+; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]])
+; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]])
+; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]])
+; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]])
+; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]])
+; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]])
+; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @nearbyint_8f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @nearbyint_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0)
+ %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1)
+ %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2)
+ %nearbyint3 = call float @llvm.nearbyint.f32(float %ld3)
+ %nearbyint4 = call float @llvm.nearbyint.f32(float %ld4)
+ %nearbyint5 = call float @llvm.nearbyint.f32(float %ld5)
+ %nearbyint6 = call float @llvm.nearbyint.f32(float %ld6)
+ %nearbyint7 = call float @llvm.nearbyint.f32(float %ld7)
+ store float %nearbyint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %nearbyint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %nearbyint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %nearbyint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %nearbyint4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %nearbyint5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %nearbyint6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %nearbyint7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @nearbyint_16f32() #0 {
+; SSE2-LABEL: @nearbyint_16f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
+; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
+; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+; SSE2-NEXT: [[NEARBYINT0:%.*]] = call float @llvm.nearbyint.f32(float [[LD0]])
+; SSE2-NEXT: [[NEARBYINT1:%.*]] = call float @llvm.nearbyint.f32(float [[LD1]])
+; SSE2-NEXT: [[NEARBYINT2:%.*]] = call float @llvm.nearbyint.f32(float [[LD2]])
+; SSE2-NEXT: [[NEARBYINT3:%.*]] = call float @llvm.nearbyint.f32(float [[LD3]])
+; SSE2-NEXT: [[NEARBYINT4:%.*]] = call float @llvm.nearbyint.f32(float [[LD4]])
+; SSE2-NEXT: [[NEARBYINT5:%.*]] = call float @llvm.nearbyint.f32(float [[LD5]])
+; SSE2-NEXT: [[NEARBYINT6:%.*]] = call float @llvm.nearbyint.f32(float [[LD6]])
+; SSE2-NEXT: [[NEARBYINT7:%.*]] = call float @llvm.nearbyint.f32(float [[LD7]])
+; SSE2-NEXT: [[NEARBYINT8:%.*]] = call float @llvm.nearbyint.f32(float [[LD8]])
+; SSE2-NEXT: [[NEARBYINT9:%.*]] = call float @llvm.nearbyint.f32(float [[LD9]])
+; SSE2-NEXT: [[NEARBYINT10:%.*]] = call float @llvm.nearbyint.f32(float [[LD10]])
+; SSE2-NEXT: [[NEARBYINT11:%.*]] = call float @llvm.nearbyint.f32(float [[LD11]])
+; SSE2-NEXT: [[NEARBYINT12:%.*]] = call float @llvm.nearbyint.f32(float [[LD12]])
+; SSE2-NEXT: [[NEARBYINT13:%.*]] = call float @llvm.nearbyint.f32(float [[LD13]])
+; SSE2-NEXT: [[NEARBYINT14:%.*]] = call float @llvm.nearbyint.f32(float [[LD14]])
+; SSE2-NEXT: [[NEARBYINT15:%.*]] = call float @llvm.nearbyint.f32(float [[LD15]])
+; SSE2-NEXT: store float [[NEARBYINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[NEARBYINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[NEARBYINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[NEARBYINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[NEARBYINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[NEARBYINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[NEARBYINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[NEARBYINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: store float [[NEARBYINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+; SSE2-NEXT: store float [[NEARBYINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE2-NEXT: store float [[NEARBYINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+; SSE2-NEXT: store float [[NEARBYINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE2-NEXT: store float [[NEARBYINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+; SSE2-NEXT: store float [[NEARBYINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE2-NEXT: store float [[NEARBYINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+; SSE2-NEXT: store float [[NEARBYINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @nearbyint_16f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @nearbyint_16f32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @nearbyint_16f32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @nearbyint_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> [[TMP1]])
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4
+ %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4
+ %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4
+ %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+ %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+ %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+ %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+ %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+ %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+ %nearbyint0 = call float @llvm.nearbyint.f32(float %ld0 )
+ %nearbyint1 = call float @llvm.nearbyint.f32(float %ld1 )
+ %nearbyint2 = call float @llvm.nearbyint.f32(float %ld2 )
+ %nearbyint3 = call float @llvm.nearbyint.f32(float %ld3 )
+ %nearbyint4 = call float @llvm.nearbyint.f32(float %ld4 )
+ %nearbyint5 = call float @llvm.nearbyint.f32(float %ld5 )
+ %nearbyint6 = call float @llvm.nearbyint.f32(float %ld6 )
+ %nearbyint7 = call float @llvm.nearbyint.f32(float %ld7 )
+ %nearbyint8 = call float @llvm.nearbyint.f32(float %ld8 )
+ %nearbyint9 = call float @llvm.nearbyint.f32(float %ld9 )
+ %nearbyint10 = call float @llvm.nearbyint.f32(float %ld10)
+ %nearbyint11 = call float @llvm.nearbyint.f32(float %ld11)
+ %nearbyint12 = call float @llvm.nearbyint.f32(float %ld12)
+ %nearbyint13 = call float @llvm.nearbyint.f32(float %ld13)
+ %nearbyint14 = call float @llvm.nearbyint.f32(float %ld14)
+ %nearbyint15 = call float @llvm.nearbyint.f32(float %ld15)
+ store float %nearbyint0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4
+ store float %nearbyint1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %nearbyint2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4
+ store float %nearbyint3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %nearbyint4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
+ store float %nearbyint5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %nearbyint6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
+ store float %nearbyint7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %nearbyint8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
+ store float %nearbyint9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %nearbyint10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %nearbyint11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %nearbyint12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %nearbyint13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %nearbyint14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %nearbyint15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
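+; rint differs from nearbyint only in that it may raise the inexact
+; exception; it lowers through the same rounding instructions, so the
+; vectorization pattern below mirrors the nearbyint tests above.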
+define void @rint_4f32() #0 {
+; SSE2-LABEL: @rint_4f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]])
+; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]])
+; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]])
+; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]])
+; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @rint_4f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @rint_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %rint0 = call float @llvm.rint.f32(float %ld0)
+ %rint1 = call float @llvm.rint.f32(float %ld1)
+ %rint2 = call float @llvm.rint.f32(float %ld2)
+ %rint3 = call float @llvm.rint.f32(float %ld3)
+ store float %rint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %rint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %rint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %rint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @rint_8f32() #0 {
+; SSE2-LABEL: @rint_8f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]])
+; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]])
+; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]])
+; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]])
+; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]])
+; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]])
+; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]])
+; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]])
+; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @rint_8f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @rint_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %rint0 = call float @llvm.rint.f32(float %ld0)
+ %rint1 = call float @llvm.rint.f32(float %ld1)
+ %rint2 = call float @llvm.rint.f32(float %ld2)
+ %rint3 = call float @llvm.rint.f32(float %ld3)
+ %rint4 = call float @llvm.rint.f32(float %ld4)
+ %rint5 = call float @llvm.rint.f32(float %ld5)
+ %rint6 = call float @llvm.rint.f32(float %ld6)
+ %rint7 = call float @llvm.rint.f32(float %ld7)
+ store float %rint0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %rint1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %rint2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %rint3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %rint4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %rint5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %rint6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %rint7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @rint_16f32() #0 {
+; SSE2-LABEL: @rint_16f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
+; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
+; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+; SSE2-NEXT: [[RINT0:%.*]] = call float @llvm.rint.f32(float [[LD0]])
+; SSE2-NEXT: [[RINT1:%.*]] = call float @llvm.rint.f32(float [[LD1]])
+; SSE2-NEXT: [[RINT2:%.*]] = call float @llvm.rint.f32(float [[LD2]])
+; SSE2-NEXT: [[RINT3:%.*]] = call float @llvm.rint.f32(float [[LD3]])
+; SSE2-NEXT: [[RINT4:%.*]] = call float @llvm.rint.f32(float [[LD4]])
+; SSE2-NEXT: [[RINT5:%.*]] = call float @llvm.rint.f32(float [[LD5]])
+; SSE2-NEXT: [[RINT6:%.*]] = call float @llvm.rint.f32(float [[LD6]])
+; SSE2-NEXT: [[RINT7:%.*]] = call float @llvm.rint.f32(float [[LD7]])
+; SSE2-NEXT: [[RINT8:%.*]] = call float @llvm.rint.f32(float [[LD8]])
+; SSE2-NEXT: [[RINT9:%.*]] = call float @llvm.rint.f32(float [[LD9]])
+; SSE2-NEXT: [[RINT10:%.*]] = call float @llvm.rint.f32(float [[LD10]])
+; SSE2-NEXT: [[RINT11:%.*]] = call float @llvm.rint.f32(float [[LD11]])
+; SSE2-NEXT: [[RINT12:%.*]] = call float @llvm.rint.f32(float [[LD12]])
+; SSE2-NEXT: [[RINT13:%.*]] = call float @llvm.rint.f32(float [[LD13]])
+; SSE2-NEXT: [[RINT14:%.*]] = call float @llvm.rint.f32(float [[LD14]])
+; SSE2-NEXT: [[RINT15:%.*]] = call float @llvm.rint.f32(float [[LD15]])
+; SSE2-NEXT: store float [[RINT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[RINT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[RINT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[RINT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[RINT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[RINT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[RINT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[RINT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: store float [[RINT8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+; SSE2-NEXT: store float [[RINT9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE2-NEXT: store float [[RINT10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+; SSE2-NEXT: store float [[RINT11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE2-NEXT: store float [[RINT12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+; SSE2-NEXT: store float [[RINT13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE2-NEXT: store float [[RINT14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+; SSE2-NEXT: store float [[RINT15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @rint_16f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @rint_16f32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @rint_16f32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @rint_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.rint.v16f32(<16 x float> [[TMP1]])
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4
+ %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4
+ %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4
+ %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+ %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+ %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+ %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+ %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+ %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+ %rint0 = call float @llvm.rint.f32(float %ld0 )
+ %rint1 = call float @llvm.rint.f32(float %ld1 )
+ %rint2 = call float @llvm.rint.f32(float %ld2 )
+ %rint3 = call float @llvm.rint.f32(float %ld3 )
+ %rint4 = call float @llvm.rint.f32(float %ld4 )
+ %rint5 = call float @llvm.rint.f32(float %ld5 )
+ %rint6 = call float @llvm.rint.f32(float %ld6 )
+ %rint7 = call float @llvm.rint.f32(float %ld7 )
+ %rint8 = call float @llvm.rint.f32(float %ld8 )
+ %rint9 = call float @llvm.rint.f32(float %ld9 )
+ %rint10 = call float @llvm.rint.f32(float %ld10)
+ %rint11 = call float @llvm.rint.f32(float %ld11)
+ %rint12 = call float @llvm.rint.f32(float %ld12)
+ %rint13 = call float @llvm.rint.f32(float %ld13)
+ %rint14 = call float @llvm.rint.f32(float %ld14)
+ %rint15 = call float @llvm.rint.f32(float %ld15)
+ store float %rint0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4
+ store float %rint1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %rint2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4
+ store float %rint3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %rint4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
+ store float %rint5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %rint6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
+ store float %rint7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %rint8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
+ store float %rint9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %rint10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %rint11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %rint12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %rint13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %rint14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %rint15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
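+; trunc (round toward zero) follows the same split: scalar on SSE2,
+; vectorized to @llvm.trunc.vNf32 once packed rounding is available
+; on SSE4.1 and later.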
+define void @trunc_4f32() #0 {
+; SSE2-LABEL: @trunc_4f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]])
+; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]])
+; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]])
+; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]])
+; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @trunc_4f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @trunc_4f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
+; AVX-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %trunc0 = call float @llvm.trunc.f32(float %ld0)
+ %trunc1 = call float @llvm.trunc.f32(float %ld1)
+ %trunc2 = call float @llvm.trunc.f32(float %ld2)
+ %trunc3 = call float @llvm.trunc.f32(float %ld3)
+ store float %trunc0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %trunc1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %trunc2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %trunc3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @trunc_8f32() #0 {
+; SSE2-LABEL: @trunc_8f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]])
+; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]])
+; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]])
+; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]])
+; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]])
+; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]])
+; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]])
+; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]])
+; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @trunc_8f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX-LABEL: @trunc_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %trunc0 = call float @llvm.trunc.f32(float %ld0)
+ %trunc1 = call float @llvm.trunc.f32(float %ld1)
+ %trunc2 = call float @llvm.trunc.f32(float %ld2)
+ %trunc3 = call float @llvm.trunc.f32(float %ld3)
+ %trunc4 = call float @llvm.trunc.f32(float %ld4)
+ %trunc5 = call float @llvm.trunc.f32(float %ld5)
+ %trunc6 = call float @llvm.trunc.f32(float %ld6)
+ %trunc7 = call float @llvm.trunc.f32(float %ld7)
+ store float %trunc0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %trunc1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %trunc2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %trunc3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %trunc4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %trunc5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %trunc6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %trunc7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @trunc_16f32() #0 {
+; SSE2-LABEL: @trunc_16f32(
+; SSE2-NEXT: [[LD0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+; SSE2-NEXT: [[LD1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+; SSE2-NEXT: [[LD2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+; SSE2-NEXT: [[LD3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+; SSE2-NEXT: [[LD4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+; SSE2-NEXT: [[LD5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+; SSE2-NEXT: [[LD6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+; SSE2-NEXT: [[LD7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+; SSE2-NEXT: [[LD8:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
+; SSE2-NEXT: [[LD9:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
+; SSE2-NEXT: [[LD10:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+; SSE2-NEXT: [[LD11:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+; SSE2-NEXT: [[LD12:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+; SSE2-NEXT: [[LD13:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+; SSE2-NEXT: [[LD14:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+; SSE2-NEXT: [[LD15:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+; SSE2-NEXT: [[TRUNC0:%.*]] = call float @llvm.trunc.f32(float [[LD0]])
+; SSE2-NEXT: [[TRUNC1:%.*]] = call float @llvm.trunc.f32(float [[LD1]])
+; SSE2-NEXT: [[TRUNC2:%.*]] = call float @llvm.trunc.f32(float [[LD2]])
+; SSE2-NEXT: [[TRUNC3:%.*]] = call float @llvm.trunc.f32(float [[LD3]])
+; SSE2-NEXT: [[TRUNC4:%.*]] = call float @llvm.trunc.f32(float [[LD4]])
+; SSE2-NEXT: [[TRUNC5:%.*]] = call float @llvm.trunc.f32(float [[LD5]])
+; SSE2-NEXT: [[TRUNC6:%.*]] = call float @llvm.trunc.f32(float [[LD6]])
+; SSE2-NEXT: [[TRUNC7:%.*]] = call float @llvm.trunc.f32(float [[LD7]])
+; SSE2-NEXT: [[TRUNC8:%.*]] = call float @llvm.trunc.f32(float [[LD8]])
+; SSE2-NEXT: [[TRUNC9:%.*]] = call float @llvm.trunc.f32(float [[LD9]])
+; SSE2-NEXT: [[TRUNC10:%.*]] = call float @llvm.trunc.f32(float [[LD10]])
+; SSE2-NEXT: [[TRUNC11:%.*]] = call float @llvm.trunc.f32(float [[LD11]])
+; SSE2-NEXT: [[TRUNC12:%.*]] = call float @llvm.trunc.f32(float [[LD12]])
+; SSE2-NEXT: [[TRUNC13:%.*]] = call float @llvm.trunc.f32(float [[LD13]])
+; SSE2-NEXT: [[TRUNC14:%.*]] = call float @llvm.trunc.f32(float [[LD14]])
+; SSE2-NEXT: [[TRUNC15:%.*]] = call float @llvm.trunc.f32(float [[LD15]])
+; SSE2-NEXT: store float [[TRUNC0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+; SSE2-NEXT: store float [[TRUNC1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE2-NEXT: store float [[TRUNC2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+; SSE2-NEXT: store float [[TRUNC3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE2-NEXT: store float [[TRUNC4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+; SSE2-NEXT: store float [[TRUNC5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE2-NEXT: store float [[TRUNC6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+; SSE2-NEXT: store float [[TRUNC7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE2-NEXT: store float [[TRUNC8]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+; SSE2-NEXT: store float [[TRUNC9]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+; SSE2-NEXT: store float [[TRUNC10]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+; SSE2-NEXT: store float [[TRUNC11]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+; SSE2-NEXT: store float [[TRUNC12]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+; SSE2-NEXT: store float [[TRUNC13]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+; SSE2-NEXT: store float [[TRUNC14]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+; SSE2-NEXT: store float [[TRUNC15]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+; SSE2-NEXT: ret void
+;
+; SSE41-LABEL: @trunc_16f32(
+; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]])
+; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]])
+; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]])
+; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]])
+; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE41-NEXT: ret void
+;
+; AVX1-LABEL: @trunc_16f32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
+; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]])
+; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @trunc_16f32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]])
+; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]])
+; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @trunc_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.trunc.v16f32(<16 x float> [[TMP1]])
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT: ret void
+;
+ %ld0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0 ), align 4
+ %ld1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1 ), align 4
+ %ld2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2 ), align 4
+ %ld3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3 ), align 4
+ %ld4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4 ), align 4
+ %ld5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5 ), align 4
+ %ld6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6 ), align 4
+ %ld7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7 ), align 4
+ %ld8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8 ), align 4
+ %ld9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9 ), align 4
+ %ld10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+ %ld11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+ %ld12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+ %ld13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+ %ld14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+ %ld15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+ %trunc0 = call float @llvm.trunc.f32(float %ld0 )
+ %trunc1 = call float @llvm.trunc.f32(float %ld1 )
+ %trunc2 = call float @llvm.trunc.f32(float %ld2 )
+ %trunc3 = call float @llvm.trunc.f32(float %ld3 )
+ %trunc4 = call float @llvm.trunc.f32(float %ld4 )
+ %trunc5 = call float @llvm.trunc.f32(float %ld5 )
+ %trunc6 = call float @llvm.trunc.f32(float %ld6 )
+ %trunc7 = call float @llvm.trunc.f32(float %ld7 )
+ %trunc8 = call float @llvm.trunc.f32(float %ld8 )
+ %trunc9 = call float @llvm.trunc.f32(float %ld9 )
+ %trunc10 = call float @llvm.trunc.f32(float %ld10)
+ %trunc11 = call float @llvm.trunc.f32(float %ld11)
+ %trunc12 = call float @llvm.trunc.f32(float %ld12)
+ %trunc13 = call float @llvm.trunc.f32(float %ld13)
+ %trunc14 = call float @llvm.trunc.f32(float %ld14)
+ %trunc15 = call float @llvm.trunc.f32(float %ld15)
+ store float %trunc0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 4
+ store float %trunc1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %trunc2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 4
+ store float %trunc3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %trunc4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 4
+ store float %trunc5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %trunc6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 4
+ store float %trunc7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %trunc8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 4
+ store float %trunc9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %trunc10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %trunc11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %trunc12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %trunc13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %trunc14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %trunc15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll
new file mode 100644
index 00000000000..ae24e92d9e5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc18.0.0"
+
+define void @test1(double* %a, double* %b, double* %c) #0 personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: invoke void @_CxxThrowException(i8* null, i8* null)
+; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK: catch.dispatch:
+; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch] unwind to caller
+; CHECK: catch:
+; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] [i8* null, i32 64, i8* null]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B]] to <2 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP6]]) [ "funclet"(token [[TMP1]]) ]
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[C]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
+; CHECK-NEXT: catchret from [[TMP1]] to label [[TRY_CONT:%.*]]
+; CHECK: try.cont:
+; CHECK-NEXT: ret void
+; CHECK: unreachable:
+; CHECK-NEXT: unreachable
+;
+entry:
+ invoke void @_CxxThrowException(i8* null, i8* null)
+ to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %entry
+ %0 = catchswitch within none [label %catch] unwind to caller
+
+catch: ; preds = %catch.dispatch
+ %1 = catchpad within %0 [i8* null, i32 64, i8* null]
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %call = tail call double @floor(double %mul) #1 [ "funclet"(token %1) ]
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ %call5 = tail call double @floor(double %mul5) #1 [ "funclet"(token %1) ]
+ store double %call, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %call5, double* %arrayidx5, align 8
+ catchret from %1 to label %try.cont
+
+try.cont:                                        ; preds = %catch
+ ret void
+
+unreachable: ; preds = %entry
+ unreachable
+}
+
+declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare double @floor(double) #1
+
+attributes #0 = { "target-features"="+sse2" }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep.ll
new file mode 100644
index 00000000000..a3cce8b0c5e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Test if SLP can handle GEP expressions.
+; The test performs the following actions:
+; x->first = y->first + 16
+; x->second = y->second + 16
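+;
+; As a rough C sketch (hypothetical names, mirroring the pseudo-code above;
+; the noalias arguments correspond to restrict):
+;   struct P { int *first, *second; };
+;   void foo1(struct P *restrict x, struct P *restrict y) {
+;     x->first  = y->first  + 16;
+;     x->second = y->second + 16;
+;   }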
+
+define void @foo1 ({ i32*, i32* }* noalias %x, { i32*, i32* }* noalias %y) {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y]], i64 0, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> <i64 16, i64 16>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X]], i64 0, i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>*
+; CHECK-NEXT: store <2 x i32*> [[TMP6]], <2 x i32*>* [[TMP8]], align 8
+; CHECK-NEXT: ret void
+;
+ %1 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %y, i64 0, i32 0
+ %2 = load i32*, i32** %1, align 8
+ %3 = getelementptr inbounds i32, i32* %2, i64 16
+ %4 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %x, i64 0, i32 0
+ store i32* %3, i32** %4, align 8
+ %5 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %y, i64 0, i32 1
+ %6 = load i32*, i32** %5, align 8
+ %7 = getelementptr inbounds i32, i32* %6, i64 16
+ %8 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %x, i64 0, i32 1
+ store i32* %7, i32** %8, align 8
+ ret void
+}
+
+; Test that we don't vectorize GEP expressions if the indices are not constants.
+; We can't produce efficient code in that case.
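+; In C terms this is the same pattern as @foo1 but with a runtime offset
+; (hypothetical sketch): x->first = y->first + i; x->second = y->second + i;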
+define void @foo2 ({ i32*, i32* }* noalias %x, { i32*, i32* }* noalias %y, i32 %i) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 [[I:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X:%.*]], i64 0, i32 0
+; CHECK-NEXT: store i32* [[TMP3]], i32** [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y]], i64 0, i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = load i32*, i32** [[TMP5]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 [[I]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X]], i64 0, i32 1
+; CHECK-NEXT: store i32* [[TMP7]], i32** [[TMP8]], align 8
+; CHECK-NEXT: ret void
+;
+ %1 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %y, i64 0, i32 0
+ %2 = load i32*, i32** %1, align 8
+ %3 = getelementptr inbounds i32, i32* %2, i32 %i
+ %4 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %x, i64 0, i32 0
+ store i32* %3, i32** %4, align 8
+ %5 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %y, i64 0, i32 1
+ %6 = load i32*, i32** %5, align 8
+ %7 = getelementptr inbounds i32, i32* %6, i32 %i
+ %8 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %x, i64 0, i32 1
+ store i32* %7, i32** %8, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep_mismatch.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep_mismatch.ll
new file mode 100644
index 00000000000..f9b9995066a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gep_mismatch.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -slp-vectorizer | FileCheck %s
+
+; This code has GEPs with different index types, which should not
+; matter for the SLPVectorizer.
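+; Concretely, in the loop below %_tmp1 advances its pointer with an i32 index
+; while %_tmp2 uses an i64 index; the CHECK lines verify that both survive
+; unchanged as scalar GEPs.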
+
+target triple = "x86_64--linux"
+
+define void @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[LS1_PH:%.*]] = phi float* [ [[_TMP1:%.*]], [[BB1]] ], [ undef, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[LS2_PH:%.*]] = phi float* [ [[_TMP2:%.*]], [[BB1]] ], [ undef, [[ENTRY]] ]
+; CHECK-NEXT: store float undef, float* [[LS1_PH]]
+; CHECK-NEXT: [[_TMP1]] = getelementptr float, float* [[LS1_PH]], i32 1
+; CHECK-NEXT: [[_TMP2]] = getelementptr float, float* [[LS2_PH]], i64 4
+; CHECK-NEXT: br i1 false, label [[BB1]], label [[BB2:%.*]]
+; CHECK: bb2:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %bb1
+
+bb1:
+ %ls1.ph = phi float* [ %_tmp1, %bb1 ], [ undef, %entry ]
+ %ls2.ph = phi float* [ %_tmp2, %bb1 ], [ undef, %entry ]
+ store float undef, float* %ls1.ph
+ %_tmp1 = getelementptr float, float* %ls1.ph, i32 1
+ %_tmp2 = getelementptr float, float* %ls2.ph, i64 4
+ br i1 false, label %bb1, label %bb2
+
+bb2:
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
new file mode 100644
index 00000000000..b02244f9614
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -0,0 +1,527 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; 128-bit vectors
+;
+
+define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: @test_v2f64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: ret <2 x double> [[TMP3]]
+;
+; SLM-LABEL: @test_v2f64(
+; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
+; SLM-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
+; SLM-NEXT: [[R1:%.*]] = fadd double [[B0]], [[B1]]
+; SLM-NEXT: [[R00:%.*]] = insertelement <2 x double> undef, double [[R0]], i32 0
+; SLM-NEXT: [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT: ret <2 x double> [[R01]]
+;
+; AVX-LABEL: @test_v2f64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <2 x double> [[TMP3]]
+;
+; AVX512-LABEL: @test_v2f64(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <2 x double> [[TMP3]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %r0 = fadd double %a0, %a1
+ %r1 = fadd double %b0, %b1
+ %r00 = insertelement <2 x double> undef, double %r0, i32 0
+ %r01 = insertelement <2 x double> %r00, double %r1, i32 1
+ ret <2 x double> %r01
+}
+
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_v4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %r0 = fadd float %a0, %a1
+ %r1 = fadd float %a2, %a3
+ %r2 = fadd float %b0, %b1
+ %r3 = fadd float %b2, %b3
+ %r00 = insertelement <4 x float> undef, float %r0, i32 0
+ %r01 = insertelement <4 x float> %r00, float %r1, i32 1
+ %r02 = insertelement <4 x float> %r01, float %r2, i32 2
+ %r03 = insertelement <4 x float> %r02, float %r3, i32 3
+ ret <4 x float> %r03
+}
+
+define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: @test_v2i64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; SSE-NEXT: ret <2 x i64> [[TMP3]]
+;
+; SLM-LABEL: @test_v2i64(
+; SLM-NEXT: [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1
+; SLM-NEXT: [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1
+; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[A1]]
+; SLM-NEXT: [[R1:%.*]] = add i64 [[B0]], [[B1]]
+; SLM-NEXT: [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0
+; SLM-NEXT: [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1
+; SLM-NEXT: ret <2 x i64> [[R01]]
+;
+; AVX-LABEL: @test_v2i64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <2 x i64> [[TMP3]]
+;
+; AVX512-LABEL: @test_v2i64(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <2 x i64> [[TMP3]]
+;
+ %a0 = extractelement <2 x i64> %a, i32 0
+ %a1 = extractelement <2 x i64> %a, i32 1
+ %b0 = extractelement <2 x i64> %b, i32 0
+ %b1 = extractelement <2 x i64> %b, i32 1
+ %r0 = add i64 %a0, %a1
+ %r1 = add i64 %b0, %b1
+ %r00 = insertelement <2 x i64> undef, i64 %r0, i32 0
+ %r01 = insertelement <2 x i64> %r00, i64 %r1, i32 1
+ ret <2 x i64> %r01
+}
+
+define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @test_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %r0 = add i32 %a0, %a1
+ %r1 = add i32 %a2, %a3
+ %r2 = add i32 %b0, %b1
+ %r3 = add i32 %b2, %b3
+ %r00 = insertelement <4 x i32> undef, i32 %r0, i32 0
+ %r01 = insertelement <4 x i32> %r00, i32 %r1, i32 1
+ %r02 = insertelement <4 x i32> %r01, i32 %r2, i32 2
+ %r03 = insertelement <4 x i32> %r02, i32 %r3, i32 3
+ ret <4 x i32> %r03
+}
+
+define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @test_v8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %a0 = extractelement <8 x i16> %a, i32 0
+ %a1 = extractelement <8 x i16> %a, i32 1
+ %a2 = extractelement <8 x i16> %a, i32 2
+ %a3 = extractelement <8 x i16> %a, i32 3
+ %a4 = extractelement <8 x i16> %a, i32 4
+ %a5 = extractelement <8 x i16> %a, i32 5
+ %a6 = extractelement <8 x i16> %a, i32 6
+ %a7 = extractelement <8 x i16> %a, i32 7
+ %b0 = extractelement <8 x i16> %b, i32 0
+ %b1 = extractelement <8 x i16> %b, i32 1
+ %b2 = extractelement <8 x i16> %b, i32 2
+ %b3 = extractelement <8 x i16> %b, i32 3
+ %b4 = extractelement <8 x i16> %b, i32 4
+ %b5 = extractelement <8 x i16> %b, i32 5
+ %b6 = extractelement <8 x i16> %b, i32 6
+ %b7 = extractelement <8 x i16> %b, i32 7
+ %r0 = add i16 %a0, %a1
+ %r1 = add i16 %a2, %a3
+ %r2 = add i16 %a4, %a5
+ %r3 = add i16 %a6, %a7
+ %r4 = add i16 %b0, %b1
+ %r5 = add i16 %b2, %b3
+ %r6 = add i16 %b4, %b5
+ %r7 = add i16 %b6, %b7
+ %r00 = insertelement <8 x i16> undef, i16 %r0, i32 0
+ %r01 = insertelement <8 x i16> %r00, i16 %r1, i32 1
+ %r02 = insertelement <8 x i16> %r01, i16 %r2, i32 2
+ %r03 = insertelement <8 x i16> %r02, i16 %r3, i32 3
+ %r04 = insertelement <8 x i16> %r03, i16 %r4, i32 4
+ %r05 = insertelement <8 x i16> %r04, i16 %r5, i32 5
+ %r06 = insertelement <8 x i16> %r05, i16 %r6, i32 6
+ %r07 = insertelement <8 x i16> %r06, i16 %r7, i32 7
+ ret <8 x i16> %r07
+}
+
+;
+; 256-bit vectors
+;
+
+define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @test_v4f64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT: ret <4 x double> [[R03]]
+;
+; SLM-LABEL: @test_v4f64(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: ret <4 x double> [[R03]]
+;
+; AVX-LABEL: @test_v4f64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x double> [[TMP3]]
+;
+; AVX512-LABEL: @test_v4f64(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <4 x double> [[TMP3]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %r0 = fadd double %a0, %a1
+ %r1 = fadd double %b0, %b1
+ %r2 = fadd double %a2, %a3
+ %r3 = fadd double %b2, %b3
+ %r00 = insertelement <4 x double> undef, double %r0, i32 0
+ %r01 = insertelement <4 x double> %r00, double %r1, i32 1
+ %r02 = insertelement <4 x double> %r01, double %r2, i32 2
+ %r03 = insertelement <4 x double> %r02, double %r3, i32 3
+ ret <4 x double> %r03
+}
+
+define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @test_v8f32(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x float> [[R07]]
+;
+; SLM-LABEL: @test_v8f32(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x float> [[R07]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX512-LABEL: @test_v8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %r0 = fadd float %a0, %a1
+ %r1 = fadd float %a2, %a3
+ %r2 = fadd float %b0, %b1
+ %r3 = fadd float %b2, %b3
+ %r4 = fadd float %a4, %a5
+ %r5 = fadd float %a6, %a7
+ %r6 = fadd float %b4, %b5
+ %r7 = fadd float %b6, %b7
+ %r00 = insertelement <8 x float> undef, float %r0, i32 0
+ %r01 = insertelement <8 x float> %r00, float %r1, i32 1
+ %r02 = insertelement <8 x float> %r01, float %r2, i32 2
+ %r03 = insertelement <8 x float> %r02, float %r3, i32 3
+ %r04 = insertelement <8 x float> %r03, float %r4, i32 4
+ %r05 = insertelement <8 x float> %r04, float %r5, i32 5
+ %r06 = insertelement <8 x float> %r05, float %r6, i32 6
+ %r07 = insertelement <8 x float> %r06, float %r7, i32 7
+ ret <8 x float> %r07
+}
+
+define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT: ret <4 x i64> [[R03]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
+; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: ret <4 x i64> [[R03]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x i64> [[TMP3]]
+;
+; AVX512-LABEL: @test_v4i64(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX512-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <4 x i64> [[TMP3]]
+;
+ %a0 = extractelement <4 x i64> %a, i32 0
+ %a1 = extractelement <4 x i64> %a, i32 1
+ %a2 = extractelement <4 x i64> %a, i32 2
+ %a3 = extractelement <4 x i64> %a, i32 3
+ %b0 = extractelement <4 x i64> %b, i32 0
+ %b1 = extractelement <4 x i64> %b, i32 1
+ %b2 = extractelement <4 x i64> %b, i32 2
+ %b3 = extractelement <4 x i64> %b, i32 3
+ %r0 = add i64 %a0, %a1
+ %r1 = add i64 %b0, %b1
+ %r2 = add i64 %a2, %a3
+ %r3 = add i64 %b2, %b3
+ %r00 = insertelement <4 x i64> undef, i64 %r0, i32 0
+ %r01 = insertelement <4 x i64> %r00, i64 %r1, i32 1
+ %r02 = insertelement <4 x i64> %r01, i64 %r2, i32 2
+ %r03 = insertelement <4 x i64> %r02, i64 %r3, i32 3
+ ret <4 x i64> %r03
+}
+
+define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[R07]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
+; SLM-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[R07]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @test_v8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX512-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %r0 = add i32 %a0, %a1
+ %r1 = add i32 %a2, %a3
+ %r2 = add i32 %b0, %b1
+ %r3 = add i32 %b2, %b3
+ %r4 = add i32 %a4, %a5
+ %r5 = add i32 %a6, %a7
+ %r6 = add i32 %b4, %b5
+ %r7 = add i32 %b6, %b7
+ %r00 = insertelement <8 x i32> undef, i32 %r0, i32 0
+ %r01 = insertelement <8 x i32> %r00, i32 %r1, i32 1
+ %r02 = insertelement <8 x i32> %r01, i32 %r2, i32 2
+ %r03 = insertelement <8 x i32> %r02, i32 %r3, i32 3
+ %r04 = insertelement <8 x i32> %r03, i32 %r4, i32 4
+ %r05 = insertelement <8 x i32> %r04, i32 %r5, i32 5
+ %r06 = insertelement <8 x i32> %r05, i32 %r6, i32 6
+ %r07 = insertelement <8 x i32> %r06, i32 %r7, i32 7
+ ret <8 x i32> %r07
+}
+
+define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; SSE-LABEL: @test_v16i16(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: ret <16 x i16> [[RV15]]
+;
+; SLM-LABEL: @test_v16i16(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SLM-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; SLM-NEXT: ret <16 x i16> [[TMP3]]
+;
+; AVX-LABEL: @test_v16i16(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <16 x i16> [[TMP3]]
+;
+; AVX512-LABEL: @test_v16i16(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX512-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <16 x i16> [[TMP3]]
+;
+ %a0 = extractelement <16 x i16> %a, i32 0
+ %a1 = extractelement <16 x i16> %a, i32 1
+ %a2 = extractelement <16 x i16> %a, i32 2
+ %a3 = extractelement <16 x i16> %a, i32 3
+ %a4 = extractelement <16 x i16> %a, i32 4
+ %a5 = extractelement <16 x i16> %a, i32 5
+ %a6 = extractelement <16 x i16> %a, i32 6
+ %a7 = extractelement <16 x i16> %a, i32 7
+ %a8 = extractelement <16 x i16> %a, i32 8
+ %a9 = extractelement <16 x i16> %a, i32 9
+ %a10 = extractelement <16 x i16> %a, i32 10
+ %a11 = extractelement <16 x i16> %a, i32 11
+ %a12 = extractelement <16 x i16> %a, i32 12
+ %a13 = extractelement <16 x i16> %a, i32 13
+ %a14 = extractelement <16 x i16> %a, i32 14
+ %a15 = extractelement <16 x i16> %a, i32 15
+ %b0 = extractelement <16 x i16> %b, i32 0
+ %b1 = extractelement <16 x i16> %b, i32 1
+ %b2 = extractelement <16 x i16> %b, i32 2
+ %b3 = extractelement <16 x i16> %b, i32 3
+ %b4 = extractelement <16 x i16> %b, i32 4
+ %b5 = extractelement <16 x i16> %b, i32 5
+ %b6 = extractelement <16 x i16> %b, i32 6
+ %b7 = extractelement <16 x i16> %b, i32 7
+ %b8 = extractelement <16 x i16> %b, i32 8
+ %b9 = extractelement <16 x i16> %b, i32 9
+ %b10 = extractelement <16 x i16> %b, i32 10
+ %b11 = extractelement <16 x i16> %b, i32 11
+ %b12 = extractelement <16 x i16> %b, i32 12
+ %b13 = extractelement <16 x i16> %b, i32 13
+ %b14 = extractelement <16 x i16> %b, i32 14
+ %b15 = extractelement <16 x i16> %b, i32 15
+ %r0 = add i16 %a0 , %a1
+ %r1 = add i16 %a2 , %a3
+ %r2 = add i16 %a4 , %a5
+ %r3 = add i16 %a6 , %a7
+ %r4 = add i16 %b0 , %b1
+ %r5 = add i16 %b2 , %b3
+ %r6 = add i16 %b4 , %b5
+ %r7 = add i16 %b6 , %b7
+ %r8 = add i16 %a8 , %a9
+ %r9 = add i16 %a10, %a11
+ %r10 = add i16 %a12, %a13
+ %r11 = add i16 %a14, %a15
+ %r12 = add i16 %b8 , %b9
+ %r13 = add i16 %b10, %b11
+ %r14 = add i16 %b12, %b13
+ %r15 = add i16 %b14, %b15
+ %rv0 = insertelement <16 x i16> undef, i16 %r0 , i32 0
+ %rv1 = insertelement <16 x i16> %rv0 , i16 %r1 , i32 1
+ %rv2 = insertelement <16 x i16> %rv1 , i16 %r2 , i32 2
+ %rv3 = insertelement <16 x i16> %rv2 , i16 %r3 , i32 3
+ %rv4 = insertelement <16 x i16> %rv3 , i16 %r4 , i32 4
+ %rv5 = insertelement <16 x i16> %rv4 , i16 %r5 , i32 5
+ %rv6 = insertelement <16 x i16> %rv5 , i16 %r6 , i32 6
+ %rv7 = insertelement <16 x i16> %rv6 , i16 %r7 , i32 7
+ %rv8 = insertelement <16 x i16> %rv7 , i16 %r8 , i32 8
+ %rv9 = insertelement <16 x i16> %rv8 , i16 %r9 , i32 9
+ %rv10 = insertelement <16 x i16> %rv9 , i16 %r10, i32 10
+ %rv11 = insertelement <16 x i16> %rv10, i16 %r11, i32 11
+ %rv12 = insertelement <16 x i16> %rv11, i16 %r12, i32 12
+ %rv13 = insertelement <16 x i16> %rv12, i16 %r13, i32 13
+ %rv14 = insertelement <16 x i16> %rv13, i16 %r14, i32 14
+ %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
+ ret <16 x i16> %rv15
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll
new file mode 100644
index 00000000000..5abf85f319b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hoist.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+;int foo(int *A, int n, int k) {
+; for (int i=0; i < 10000; i+=4) {
+; A[i] += n;
+; A[i+1] += k;
+; A[i+2] += n;
+; A[i+3] += k;
+; }
+;}
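+;
+; Note on the IR below: the i+1..i+3 offsets appear as 'or' instructions
+; (e.g. %add121 = or i32 %i.024, 1); since %i.024 is always a multiple of 4,
+; 'or' with 1, 2 or 3 is equivalent to 'add' here.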
+
+define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[N:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[K:%.*]], i32 1
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_024]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[SHUFFLE]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: [[ADD10]] = add nsw i32 [[I_024]], 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD10]], 10000
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.024 = phi i32 [ 0, %entry ], [ %add10, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.024
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, %n
+ store i32 %add, i32* %arrayidx, align 4
+ %add121 = or i32 %i.024, 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %add121
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %1, %k
+ store i32 %add3, i32* %arrayidx2, align 4
+ %add422 = or i32 %i.024, 2
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i32 %add422
+ %2 = load i32, i32* %arrayidx5, align 4
+ %add6 = add nsw i32 %2, %n
+ store i32 %add6, i32* %arrayidx5, align 4
+ %add723 = or i32 %i.024, 3
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add723
+ %3 = load i32, i32* %arrayidx8, align 4
+ %add9 = add nsw i32 %3, %k
+ store i32 %add9, i32* %arrayidx8, align 4
+ %add10 = add nsw i32 %i.024, 4
+ %cmp = icmp slt i32 %add10, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret i32 undef
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
new file mode 100644
index 00000000000..6c3994b0a22
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -0,0 +1,1733 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-threshold=-10 | FileCheck %s --check-prefix=THRESHOLD
+
+@n = external local_unnamed_addr global i32, align 4
+@arr = common local_unnamed_addr global [20 x float] zeroinitializer, align 16
+@arr1 = common local_unnamed_addr global [20 x float] zeroinitializer, align 16
+@res = external local_unnamed_addr global float, align 4
+
+define float @baz() {
+; CHECK-LABEL: @baz(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
+; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
+; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
+; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
+; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
+; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
+; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
+; CHECK-NEXT: ret float [[ADD19_3]]
+;
+; THRESHOLD-LABEL: @baz(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
+; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
+; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
+; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
+; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
+; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
+; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
+; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
+; THRESHOLD-NEXT: store float [[ADD19_3]], float* @res, align 4
+; THRESHOLD-NEXT: ret float [[ADD19_3]]
+;
+entry:
+ %0 = load i32, i32* @n, align 4
+ %mul = mul nsw i32 %0, 3
+ %conv = sitofp i32 %mul to float
+ %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
+ %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
+ %mul4 = fmul fast float %2, %1
+ %add = fadd fast float %mul4, %conv
+ %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
+ %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
+ %mul4.1 = fmul fast float %4, %3
+ %add.1 = fadd fast float %mul4.1, %add
+ %5 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+ %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+ %mul4.2 = fmul fast float %6, %5
+ %add.2 = fadd fast float %mul4.2, %add.1
+ %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+ %8 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+ %mul4.3 = fmul fast float %8, %7
+ %add.3 = fadd fast float %mul4.3, %add.2
+ %add7 = fadd fast float %add.3, %conv
+ %add19 = fadd fast float %mul4, %add7
+ %add19.1 = fadd fast float %mul4.1, %add19
+ %add19.2 = fadd fast float %mul4.2, %add19.1
+ %add19.3 = fadd fast float %mul4.3, %add19.2
+ store float %add19.3, float* @res, align 4
+ ret float %add19.3
+}
+
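+; @bazz reduces eight products plus the two scalar terms %conv and %conv6;
+; the CHECK lines show an <8 x float> multiply, a shuffle-based fadd
+; reduction, and the scalar terms folded back in via the OP_EXTRA adds.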
+define float @bazz() {
+; CHECK-LABEL: @bazz(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
+; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
+; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
+; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
+; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
+; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
+; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
+; CHECK-NEXT: store float [[OP_EXTRA5]], float* @res, align 4
+; CHECK-NEXT: ret float [[OP_EXTRA5]]
+;
+; THRESHOLD-LABEL: @bazz(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
+; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
+; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
+; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
+; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
+; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
+; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]]
+; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
+; THRESHOLD-NEXT: store float [[OP_EXTRA5]], float* @res, align 4
+; THRESHOLD-NEXT: ret float [[OP_EXTRA5]]
+;
+entry:
+ %0 = load i32, i32* @n, align 4
+ %mul = mul nsw i32 %0, 3
+ %conv = sitofp i32 %mul to float
+ %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
+ %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
+ %mul4 = fmul fast float %2, %1
+ %add = fadd fast float %mul4, %conv
+ %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
+ %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
+ %mul4.1 = fmul fast float %4, %3
+ %add.1 = fadd fast float %mul4.1, %add
+ %5 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+ %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+ %mul4.2 = fmul fast float %6, %5
+ %add.2 = fadd fast float %mul4.2, %add.1
+ %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+ %8 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+ %mul4.3 = fmul fast float %8, %7
+ %add.3 = fadd fast float %mul4.3, %add.2
+ %mul5 = shl nsw i32 %0, 2
+ %conv6 = sitofp i32 %mul5 to float
+ %add7 = fadd fast float %add.3, %conv6
+ %9 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4), align 16
+ %10 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4), align 16
+ %mul18 = fmul fast float %10, %9
+ %add19 = fadd fast float %mul18, %add7
+ %11 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 5), align 4
+ %12 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 5), align 4
+ %mul18.1 = fmul fast float %12, %11
+ %add19.1 = fadd fast float %mul18.1, %add19
+ %13 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6), align 8
+ %14 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6), align 8
+ %mul18.2 = fmul fast float %14, %13
+ %add19.2 = fadd fast float %mul18.2, %add19.1
+ %15 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 7), align 4
+ %16 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 7), align 4
+ %mul18.3 = fmul fast float %16, %15
+ %add19.3 = fadd fast float %mul18.3, %add19.2
+ store float %add19.3, float* @res, align 4
+ ret float %add19.3
+}
+
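+; @bazzz sums four products and multiplies the result by %conv; the CHECK
+; lines show a <4 x float> multiply followed by a shuffle-based fadd
+; reduction.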
+define float @bazzz() {
+; CHECK-LABEL: @bazzz(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; CHECK-NEXT: store float [[TMP8]], float* @res, align 4
+; CHECK-NEXT: ret float [[TMP8]]
+;
+; THRESHOLD-LABEL: @bazzz(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
+; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; THRESHOLD-NEXT: store float [[TMP8]], float* @res, align 4
+; THRESHOLD-NEXT: ret float [[TMP8]]
+;
+entry:
+ %0 = load i32, i32* @n, align 4
+ %conv = sitofp i32 %0 to float
+ %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
+ %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
+ %mul = fmul fast float %2, %1
+ %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
+ %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
+ %mul.1 = fmul fast float %4, %3
+ %5 = fadd fast float %mul.1, %mul
+ %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+ %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+ %mul.2 = fmul fast float %7, %6
+ %8 = fadd fast float %mul.2, %5
+ %9 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+ %10 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+ %mul.3 = fmul fast float %10, %9
+ %11 = fadd fast float %mul.3, %8
+ %12 = fmul fast float %conv, %11
+ store float %12, float* @res, align 4
+ ret float %12
+}
+
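+; @foo performs the same reduction as @bazzz, but converts the result with
+; fptosi and stores it back to @n.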
+define i32 @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32
+; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4
+; CHECK-NEXT: ret i32 [[CONV4]]
+;
+; THRESHOLD-LABEL: @foo(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
+; THRESHOLD-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; THRESHOLD-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32
+; THRESHOLD-NEXT: store i32 [[CONV4]], i32* @n, align 4
+; THRESHOLD-NEXT: ret i32 [[CONV4]]
+;
+entry:
+ %0 = load i32, i32* @n, align 4
+ %conv = sitofp i32 %0 to float
+ %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
+ %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
+ %mul = fmul fast float %2, %1
+ %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
+ %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
+ %mul.1 = fmul fast float %4, %3
+ %5 = fadd fast float %mul.1, %mul
+ %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+ %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+ %mul.2 = fmul fast float %7, %6
+ %8 = fadd fast float %mul.2, %5
+ %9 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+ %10 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+ %mul.3 = fmul fast float %10, %9
+ %11 = fadd fast float %mul.3, %8
+ %12 = fmul fast float %conv, %11
+ %conv4 = fptosi float %12 to i32
+ store i32 %conv4, i32* @n, align 4
+ ret i32 %conv4
+}
+
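+; @bar is a max reduction (fcmp fast ogt + select) over four products; the
+; CHECK lines show the shuffle-based RDX_MINMAX lowering.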
+define float @bar() {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[CMP4:%.*]] = fcmp fast ogt float undef, undef
+; CHECK-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef
+; CHECK-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], undef
+; CHECK-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef
+; CHECK-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
+; CHECK-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef
+; CHECK-NEXT: store float [[TMP3]], float* @res, align 4
+; CHECK-NEXT: ret float [[TMP3]]
+;
+; THRESHOLD-LABEL: @bar(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
+; THRESHOLD-NEXT: [[CMP4:%.*]] = fcmp fast ogt float undef, undef
+; THRESHOLD-NEXT: [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float undef, float undef
+; THRESHOLD-NEXT: [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], undef
+; THRESHOLD-NEXT: [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float undef
+; THRESHOLD-NEXT: [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], undef
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP2]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP2]], <4 x float> [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0
+; THRESHOLD-NEXT: [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float undef
+; THRESHOLD-NEXT: store float [[TMP3]], float* @res, align 4
+; THRESHOLD-NEXT: ret float [[TMP3]]
+;
+entry:
+ %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
+ %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
+ %mul = fmul fast float %1, %0
+ %2 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
+ %3 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
+ %mul3 = fmul fast float %3, %2
+ %cmp4 = fcmp fast ogt float %mul, %mul3
+ %max.0.mul3 = select i1 %cmp4, float %mul, float %mul3
+ %4 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+ %5 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+ %mul3.1 = fmul fast float %5, %4
+ %cmp4.1 = fcmp fast ogt float %max.0.mul3, %mul3.1
+ %max.0.mul3.1 = select i1 %cmp4.1, float %max.0.mul3, float %mul3.1
+ %6 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+ %7 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+ %mul3.2 = fmul fast float %7, %6
+ %cmp4.2 = fcmp fast ogt float %max.0.mul3.1, %mul3.2
+ %max.0.mul3.2 = select i1 %cmp4.2, float %max.0.mul3.1, float %mul3.2
+ store float %max.0.mul3.2, float* @res, align 4
+ ret float %max.0.mul3.2
+}
+
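+; @f is a 48-element fadd reduction over consecutive loads from %x; the
+; CHECK lines show it split into a <16 x float> part and a <32 x float>
+; part whose partial sums are combined by OP_RDX.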
+define float @f(float* nocapture readonly %x) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; CHECK-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
+; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
+; CHECK-NEXT: [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33
+; CHECK-NEXT: [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34
+; CHECK-NEXT: [[ARRAYIDX_35:%.*]] = getelementptr inbounds float, float* [[X]], i64 35
+; CHECK-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds float, float* [[X]], i64 36
+; CHECK-NEXT: [[ARRAYIDX_37:%.*]] = getelementptr inbounds float, float* [[X]], i64 37
+; CHECK-NEXT: [[ARRAYIDX_38:%.*]] = getelementptr inbounds float, float* [[X]], i64 38
+; CHECK-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds float, float* [[X]], i64 39
+; CHECK-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds float, float* [[X]], i64 40
+; CHECK-NEXT: [[ARRAYIDX_41:%.*]] = getelementptr inbounds float, float* [[X]], i64 41
+; CHECK-NEXT: [[ARRAYIDX_42:%.*]] = getelementptr inbounds float, float* [[X]], i64 42
+; CHECK-NEXT: [[ARRAYIDX_43:%.*]] = getelementptr inbounds float, float* [[X]], i64 43
+; CHECK-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds float, float* [[X]], i64 44
+; CHECK-NEXT: [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45
+; CHECK-NEXT: [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46
+; CHECK-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; CHECK-NEXT: [[ADD_32:%.*]] = fadd fast float undef, [[ADD_31]]
+; CHECK-NEXT: [[ADD_33:%.*]] = fadd fast float undef, [[ADD_32]]
+; CHECK-NEXT: [[ADD_34:%.*]] = fadd fast float undef, [[ADD_33]]
+; CHECK-NEXT: [[ADD_35:%.*]] = fadd fast float undef, [[ADD_34]]
+; CHECK-NEXT: [[ADD_36:%.*]] = fadd fast float undef, [[ADD_35]]
+; CHECK-NEXT: [[ADD_37:%.*]] = fadd fast float undef, [[ADD_36]]
+; CHECK-NEXT: [[ADD_38:%.*]] = fadd fast float undef, [[ADD_37]]
+; CHECK-NEXT: [[ADD_39:%.*]] = fadd fast float undef, [[ADD_38]]
+; CHECK-NEXT: [[ADD_40:%.*]] = fadd fast float undef, [[ADD_39]]
+; CHECK-NEXT: [[ADD_41:%.*]] = fadd fast float undef, [[ADD_40]]
+; CHECK-NEXT: [[ADD_42:%.*]] = fadd fast float undef, [[ADD_41]]
+; CHECK-NEXT: [[ADD_43:%.*]] = fadd fast float undef, [[ADD_42]]
+; CHECK-NEXT: [[ADD_44:%.*]] = fadd fast float undef, [[ADD_43]]
+; CHECK-NEXT: [[ADD_45:%.*]] = fadd fast float undef, [[ADD_44]]
+; CHECK-NEXT: [[ADD_46:%.*]] = fadd fast float undef, [[ADD_45]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = fadd fast <16 x float> [[TMP1]], [[RDX_SHUF9]]
+; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <16 x float> [[BIN_RDX10]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <16 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <16 x float> [[BIN_RDX12]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <16 x float> [[BIN_RDX12]], [[RDX_SHUF13]]
+; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]]
+; CHECK-NEXT: ret float [[OP_RDX]]
+;
+; THRESHOLD-LABEL: @f(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>*
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, undef
+; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
+; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
+; THRESHOLD-NEXT: [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33
+; THRESHOLD-NEXT: [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34
+; THRESHOLD-NEXT: [[ARRAYIDX_35:%.*]] = getelementptr inbounds float, float* [[X]], i64 35
+; THRESHOLD-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds float, float* [[X]], i64 36
+; THRESHOLD-NEXT: [[ARRAYIDX_37:%.*]] = getelementptr inbounds float, float* [[X]], i64 37
+; THRESHOLD-NEXT: [[ARRAYIDX_38:%.*]] = getelementptr inbounds float, float* [[X]], i64 38
+; THRESHOLD-NEXT: [[ARRAYIDX_39:%.*]] = getelementptr inbounds float, float* [[X]], i64 39
+; THRESHOLD-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds float, float* [[X]], i64 40
+; THRESHOLD-NEXT: [[ARRAYIDX_41:%.*]] = getelementptr inbounds float, float* [[X]], i64 41
+; THRESHOLD-NEXT: [[ARRAYIDX_42:%.*]] = getelementptr inbounds float, float* [[X]], i64 42
+; THRESHOLD-NEXT: [[ARRAYIDX_43:%.*]] = getelementptr inbounds float, float* [[X]], i64 43
+; THRESHOLD-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds float, float* [[X]], i64 44
+; THRESHOLD-NEXT: [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45
+; THRESHOLD-NEXT: [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46
+; THRESHOLD-NEXT: [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47
+; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
+; THRESHOLD-NEXT: [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; THRESHOLD-NEXT: [[ADD_32:%.*]] = fadd fast float undef, [[ADD_31]]
+; THRESHOLD-NEXT: [[ADD_33:%.*]] = fadd fast float undef, [[ADD_32]]
+; THRESHOLD-NEXT: [[ADD_34:%.*]] = fadd fast float undef, [[ADD_33]]
+; THRESHOLD-NEXT: [[ADD_35:%.*]] = fadd fast float undef, [[ADD_34]]
+; THRESHOLD-NEXT: [[ADD_36:%.*]] = fadd fast float undef, [[ADD_35]]
+; THRESHOLD-NEXT: [[ADD_37:%.*]] = fadd fast float undef, [[ADD_36]]
+; THRESHOLD-NEXT: [[ADD_38:%.*]] = fadd fast float undef, [[ADD_37]]
+; THRESHOLD-NEXT: [[ADD_39:%.*]] = fadd fast float undef, [[ADD_38]]
+; THRESHOLD-NEXT: [[ADD_40:%.*]] = fadd fast float undef, [[ADD_39]]
+; THRESHOLD-NEXT: [[ADD_41:%.*]] = fadd fast float undef, [[ADD_40]]
+; THRESHOLD-NEXT: [[ADD_42:%.*]] = fadd fast float undef, [[ADD_41]]
+; THRESHOLD-NEXT: [[ADD_43:%.*]] = fadd fast float undef, [[ADD_42]]
+; THRESHOLD-NEXT: [[ADD_44:%.*]] = fadd fast float undef, [[ADD_43]]
+; THRESHOLD-NEXT: [[ADD_45:%.*]] = fadd fast float undef, [[ADD_44]]
+; THRESHOLD-NEXT: [[ADD_46:%.*]] = fadd fast float undef, [[ADD_45]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; THRESHOLD-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX10:%.*]] = fadd fast <16 x float> [[TMP1]], [[RDX_SHUF9]]
+; THRESHOLD-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <16 x float> [[BIN_RDX10]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX12:%.*]] = fadd fast <16 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; THRESHOLD-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <16 x float> [[BIN_RDX12]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX14:%.*]] = fadd fast <16 x float> [[BIN_RDX12]], [[RDX_SHUF13]]
+; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
+; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; THRESHOLD-NEXT: [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]]
+; THRESHOLD-NEXT: ret float [[OP_RDX]]
+;
+entry:
+ %0 = load float, float* %x, align 4
+ %arrayidx.1 = getelementptr inbounds float, float* %x, i64 1
+ %1 = load float, float* %arrayidx.1, align 4
+ %add.1 = fadd fast float %1, %0
+ %arrayidx.2 = getelementptr inbounds float, float* %x, i64 2
+ %2 = load float, float* %arrayidx.2, align 4
+ %add.2 = fadd fast float %2, %add.1
+ %arrayidx.3 = getelementptr inbounds float, float* %x, i64 3
+ %3 = load float, float* %arrayidx.3, align 4
+ %add.3 = fadd fast float %3, %add.2
+ %arrayidx.4 = getelementptr inbounds float, float* %x, i64 4
+ %4 = load float, float* %arrayidx.4, align 4
+ %add.4 = fadd fast float %4, %add.3
+ %arrayidx.5 = getelementptr inbounds float, float* %x, i64 5
+ %5 = load float, float* %arrayidx.5, align 4
+ %add.5 = fadd fast float %5, %add.4
+ %arrayidx.6 = getelementptr inbounds float, float* %x, i64 6
+ %6 = load float, float* %arrayidx.6, align 4
+ %add.6 = fadd fast float %6, %add.5
+ %arrayidx.7 = getelementptr inbounds float, float* %x, i64 7
+ %7 = load float, float* %arrayidx.7, align 4
+ %add.7 = fadd fast float %7, %add.6
+ %arrayidx.8 = getelementptr inbounds float, float* %x, i64 8
+ %8 = load float, float* %arrayidx.8, align 4
+ %add.8 = fadd fast float %8, %add.7
+ %arrayidx.9 = getelementptr inbounds float, float* %x, i64 9
+ %9 = load float, float* %arrayidx.9, align 4
+ %add.9 = fadd fast float %9, %add.8
+ %arrayidx.10 = getelementptr inbounds float, float* %x, i64 10
+ %10 = load float, float* %arrayidx.10, align 4
+ %add.10 = fadd fast float %10, %add.9
+ %arrayidx.11 = getelementptr inbounds float, float* %x, i64 11
+ %11 = load float, float* %arrayidx.11, align 4
+ %add.11 = fadd fast float %11, %add.10
+ %arrayidx.12 = getelementptr inbounds float, float* %x, i64 12
+ %12 = load float, float* %arrayidx.12, align 4
+ %add.12 = fadd fast float %12, %add.11
+ %arrayidx.13 = getelementptr inbounds float, float* %x, i64 13
+ %13 = load float, float* %arrayidx.13, align 4
+ %add.13 = fadd fast float %13, %add.12
+ %arrayidx.14 = getelementptr inbounds float, float* %x, i64 14
+ %14 = load float, float* %arrayidx.14, align 4
+ %add.14 = fadd fast float %14, %add.13
+ %arrayidx.15 = getelementptr inbounds float, float* %x, i64 15
+ %15 = load float, float* %arrayidx.15, align 4
+ %add.15 = fadd fast float %15, %add.14
+ %arrayidx.16 = getelementptr inbounds float, float* %x, i64 16
+ %16 = load float, float* %arrayidx.16, align 4
+ %add.16 = fadd fast float %16, %add.15
+ %arrayidx.17 = getelementptr inbounds float, float* %x, i64 17
+ %17 = load float, float* %arrayidx.17, align 4
+ %add.17 = fadd fast float %17, %add.16
+ %arrayidx.18 = getelementptr inbounds float, float* %x, i64 18
+ %18 = load float, float* %arrayidx.18, align 4
+ %add.18 = fadd fast float %18, %add.17
+ %arrayidx.19 = getelementptr inbounds float, float* %x, i64 19
+ %19 = load float, float* %arrayidx.19, align 4
+ %add.19 = fadd fast float %19, %add.18
+ %arrayidx.20 = getelementptr inbounds float, float* %x, i64 20
+ %20 = load float, float* %arrayidx.20, align 4
+ %add.20 = fadd fast float %20, %add.19
+ %arrayidx.21 = getelementptr inbounds float, float* %x, i64 21
+ %21 = load float, float* %arrayidx.21, align 4
+ %add.21 = fadd fast float %21, %add.20
+ %arrayidx.22 = getelementptr inbounds float, float* %x, i64 22
+ %22 = load float, float* %arrayidx.22, align 4
+ %add.22 = fadd fast float %22, %add.21
+ %arrayidx.23 = getelementptr inbounds float, float* %x, i64 23
+ %23 = load float, float* %arrayidx.23, align 4
+ %add.23 = fadd fast float %23, %add.22
+ %arrayidx.24 = getelementptr inbounds float, float* %x, i64 24
+ %24 = load float, float* %arrayidx.24, align 4
+ %add.24 = fadd fast float %24, %add.23
+ %arrayidx.25 = getelementptr inbounds float, float* %x, i64 25
+ %25 = load float, float* %arrayidx.25, align 4
+ %add.25 = fadd fast float %25, %add.24
+ %arrayidx.26 = getelementptr inbounds float, float* %x, i64 26
+ %26 = load float, float* %arrayidx.26, align 4
+ %add.26 = fadd fast float %26, %add.25
+ %arrayidx.27 = getelementptr inbounds float, float* %x, i64 27
+ %27 = load float, float* %arrayidx.27, align 4
+ %add.27 = fadd fast float %27, %add.26
+ %arrayidx.28 = getelementptr inbounds float, float* %x, i64 28
+ %28 = load float, float* %arrayidx.28, align 4
+ %add.28 = fadd fast float %28, %add.27
+ %arrayidx.29 = getelementptr inbounds float, float* %x, i64 29
+ %29 = load float, float* %arrayidx.29, align 4
+ %add.29 = fadd fast float %29, %add.28
+ %arrayidx.30 = getelementptr inbounds float, float* %x, i64 30
+ %30 = load float, float* %arrayidx.30, align 4
+ %add.30 = fadd fast float %30, %add.29
+ %arrayidx.31 = getelementptr inbounds float, float* %x, i64 31
+ %31 = load float, float* %arrayidx.31, align 4
+ %add.31 = fadd fast float %31, %add.30
+ %arrayidx.32 = getelementptr inbounds float, float* %x, i64 32
+ %32 = load float, float* %arrayidx.32, align 4
+ %add.32 = fadd fast float %32, %add.31
+ %arrayidx.33 = getelementptr inbounds float, float* %x, i64 33
+ %33 = load float, float* %arrayidx.33, align 4
+ %add.33 = fadd fast float %33, %add.32
+ %arrayidx.34 = getelementptr inbounds float, float* %x, i64 34
+ %34 = load float, float* %arrayidx.34, align 4
+ %add.34 = fadd fast float %34, %add.33
+ %arrayidx.35 = getelementptr inbounds float, float* %x, i64 35
+ %35 = load float, float* %arrayidx.35, align 4
+ %add.35 = fadd fast float %35, %add.34
+ %arrayidx.36 = getelementptr inbounds float, float* %x, i64 36
+ %36 = load float, float* %arrayidx.36, align 4
+ %add.36 = fadd fast float %36, %add.35
+ %arrayidx.37 = getelementptr inbounds float, float* %x, i64 37
+ %37 = load float, float* %arrayidx.37, align 4
+ %add.37 = fadd fast float %37, %add.36
+ %arrayidx.38 = getelementptr inbounds float, float* %x, i64 38
+ %38 = load float, float* %arrayidx.38, align 4
+ %add.38 = fadd fast float %38, %add.37
+ %arrayidx.39 = getelementptr inbounds float, float* %x, i64 39
+ %39 = load float, float* %arrayidx.39, align 4
+ %add.39 = fadd fast float %39, %add.38
+ %arrayidx.40 = getelementptr inbounds float, float* %x, i64 40
+ %40 = load float, float* %arrayidx.40, align 4
+ %add.40 = fadd fast float %40, %add.39
+ %arrayidx.41 = getelementptr inbounds float, float* %x, i64 41
+ %41 = load float, float* %arrayidx.41, align 4
+ %add.41 = fadd fast float %41, %add.40
+ %arrayidx.42 = getelementptr inbounds float, float* %x, i64 42
+ %42 = load float, float* %arrayidx.42, align 4
+ %add.42 = fadd fast float %42, %add.41
+ %arrayidx.43 = getelementptr inbounds float, float* %x, i64 43
+ %43 = load float, float* %arrayidx.43, align 4
+ %add.43 = fadd fast float %43, %add.42
+ %arrayidx.44 = getelementptr inbounds float, float* %x, i64 44
+ %44 = load float, float* %arrayidx.44, align 4
+ %add.44 = fadd fast float %44, %add.43
+ %arrayidx.45 = getelementptr inbounds float, float* %x, i64 45
+ %45 = load float, float* %arrayidx.45, align 4
+ %add.45 = fadd fast float %45, %add.44
+ %arrayidx.46 = getelementptr inbounds float, float* %x, i64 46
+ %46 = load float, float* %arrayidx.46, align 4
+ %add.46 = fadd fast float %46, %add.45
+ %arrayidx.47 = getelementptr inbounds float, float* %x, i64 47
+ %47 = load float, float* %arrayidx.47, align 4
+ %add.47 = fadd fast float %47, %add.46
+ ret float %add.47
+}
+
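+; @f1 is a 32-element fadd reduction over x[0..31], seeded with the scalar
+; sitofp(srem %a, %b).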
+define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; CHECK-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; CHECK-NEXT: ret float [[OP_EXTRA]]
+;
+; THRESHOLD-LABEL: @f1(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float
+; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
+; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; THRESHOLD-NEXT: ret float [[OP_EXTRA]]
+;
+ entry:
+ %rem = srem i32 %a, %b
+ %conv = sitofp i32 %rem to float
+ %0 = load float, float* %x, align 4
+ %add = fadd fast float %0, %conv
+ %arrayidx.1 = getelementptr inbounds float, float* %x, i64 1
+ %1 = load float, float* %arrayidx.1, align 4
+ %add.1 = fadd fast float %1, %add
+ %arrayidx.2 = getelementptr inbounds float, float* %x, i64 2
+ %2 = load float, float* %arrayidx.2, align 4
+ %add.2 = fadd fast float %2, %add.1
+ %arrayidx.3 = getelementptr inbounds float, float* %x, i64 3
+ %3 = load float, float* %arrayidx.3, align 4
+ %add.3 = fadd fast float %3, %add.2
+ %arrayidx.4 = getelementptr inbounds float, float* %x, i64 4
+ %4 = load float, float* %arrayidx.4, align 4
+ %add.4 = fadd fast float %4, %add.3
+ %arrayidx.5 = getelementptr inbounds float, float* %x, i64 5
+ %5 = load float, float* %arrayidx.5, align 4
+ %add.5 = fadd fast float %5, %add.4
+ %arrayidx.6 = getelementptr inbounds float, float* %x, i64 6
+ %6 = load float, float* %arrayidx.6, align 4
+ %add.6 = fadd fast float %6, %add.5
+ %arrayidx.7 = getelementptr inbounds float, float* %x, i64 7
+ %7 = load float, float* %arrayidx.7, align 4
+ %add.7 = fadd fast float %7, %add.6
+ %arrayidx.8 = getelementptr inbounds float, float* %x, i64 8
+ %8 = load float, float* %arrayidx.8, align 4
+ %add.8 = fadd fast float %8, %add.7
+ %arrayidx.9 = getelementptr inbounds float, float* %x, i64 9
+ %9 = load float, float* %arrayidx.9, align 4
+ %add.9 = fadd fast float %9, %add.8
+ %arrayidx.10 = getelementptr inbounds float, float* %x, i64 10
+ %10 = load float, float* %arrayidx.10, align 4
+ %add.10 = fadd fast float %10, %add.9
+ %arrayidx.11 = getelementptr inbounds float, float* %x, i64 11
+ %11 = load float, float* %arrayidx.11, align 4
+ %add.11 = fadd fast float %11, %add.10
+ %arrayidx.12 = getelementptr inbounds float, float* %x, i64 12
+ %12 = load float, float* %arrayidx.12, align 4
+ %add.12 = fadd fast float %12, %add.11
+ %arrayidx.13 = getelementptr inbounds float, float* %x, i64 13
+ %13 = load float, float* %arrayidx.13, align 4
+ %add.13 = fadd fast float %13, %add.12
+ %arrayidx.14 = getelementptr inbounds float, float* %x, i64 14
+ %14 = load float, float* %arrayidx.14, align 4
+ %add.14 = fadd fast float %14, %add.13
+ %arrayidx.15 = getelementptr inbounds float, float* %x, i64 15
+ %15 = load float, float* %arrayidx.15, align 4
+ %add.15 = fadd fast float %15, %add.14
+ %arrayidx.16 = getelementptr inbounds float, float* %x, i64 16
+ %16 = load float, float* %arrayidx.16, align 4
+ %add.16 = fadd fast float %16, %add.15
+ %arrayidx.17 = getelementptr inbounds float, float* %x, i64 17
+ %17 = load float, float* %arrayidx.17, align 4
+ %add.17 = fadd fast float %17, %add.16
+ %arrayidx.18 = getelementptr inbounds float, float* %x, i64 18
+ %18 = load float, float* %arrayidx.18, align 4
+ %add.18 = fadd fast float %18, %add.17
+ %arrayidx.19 = getelementptr inbounds float, float* %x, i64 19
+ %19 = load float, float* %arrayidx.19, align 4
+ %add.19 = fadd fast float %19, %add.18
+ %arrayidx.20 = getelementptr inbounds float, float* %x, i64 20
+ %20 = load float, float* %arrayidx.20, align 4
+ %add.20 = fadd fast float %20, %add.19
+ %arrayidx.21 = getelementptr inbounds float, float* %x, i64 21
+ %21 = load float, float* %arrayidx.21, align 4
+ %add.21 = fadd fast float %21, %add.20
+ %arrayidx.22 = getelementptr inbounds float, float* %x, i64 22
+ %22 = load float, float* %arrayidx.22, align 4
+ %add.22 = fadd fast float %22, %add.21
+ %arrayidx.23 = getelementptr inbounds float, float* %x, i64 23
+ %23 = load float, float* %arrayidx.23, align 4
+ %add.23 = fadd fast float %23, %add.22
+ %arrayidx.24 = getelementptr inbounds float, float* %x, i64 24
+ %24 = load float, float* %arrayidx.24, align 4
+ %add.24 = fadd fast float %24, %add.23
+ %arrayidx.25 = getelementptr inbounds float, float* %x, i64 25
+ %25 = load float, float* %arrayidx.25, align 4
+ %add.25 = fadd fast float %25, %add.24
+ %arrayidx.26 = getelementptr inbounds float, float* %x, i64 26
+ %26 = load float, float* %arrayidx.26, align 4
+ %add.26 = fadd fast float %26, %add.25
+ %arrayidx.27 = getelementptr inbounds float, float* %x, i64 27
+ %27 = load float, float* %arrayidx.27, align 4
+ %add.27 = fadd fast float %27, %add.26
+ %arrayidx.28 = getelementptr inbounds float, float* %x, i64 28
+ %28 = load float, float* %arrayidx.28, align 4
+ %add.28 = fadd fast float %28, %add.27
+ %arrayidx.29 = getelementptr inbounds float, float* %x, i64 29
+ %29 = load float, float* %arrayidx.29, align 4
+ %add.29 = fadd fast float %29, %add.28
+ %arrayidx.30 = getelementptr inbounds float, float* %x, i64 30
+ %30 = load float, float* %arrayidx.30, align 4
+ %add.30 = fadd fast float %30, %add.29
+ %arrayidx.31 = getelementptr inbounds float, float* %x, i64 31
+ %31 = load float, float* %arrayidx.31, align 4
+ %add.31 = fadd fast float %31, %add.30
+ ret float %add.31
+}
+
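+; The chain below sums 30 consecutive floats starting at x[1], so no single
+; power-of-two vector covers it; check that it is split into <16 x float>,
+; <8 x float> and <4 x float> loads plus two scalar fadds for x[1] and x[2].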
+define float @loadadd31(float* nocapture readonly %x) {
+; CHECK-LABEL: @loadadd31(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]]
+; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]]
+; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0
+; CHECK-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
+; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; CHECK-NEXT: ret float [[TMP12]]
+;
+; THRESHOLD-LABEL: @loadadd31(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
+; THRESHOLD-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; THRESHOLD-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; THRESHOLD-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; THRESHOLD-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; THRESHOLD-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; THRESHOLD-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; THRESHOLD-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; THRESHOLD-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; THRESHOLD-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; THRESHOLD-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; THRESHOLD-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; THRESHOLD-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
+; THRESHOLD-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
+; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]]
+; THRESHOLD-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; THRESHOLD-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]]
+; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
+; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0
+; THRESHOLD-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]]
+; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
+; THRESHOLD-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT: ret float [[TMP12]]
+;
+ entry:
+ %arrayidx = getelementptr inbounds float, float* %x, i64 1
+ %0 = load float, float* %arrayidx, align 4
+ %arrayidx.1 = getelementptr inbounds float, float* %x, i64 2
+ %1 = load float, float* %arrayidx.1, align 4
+ %add.1 = fadd fast float %1, %0
+ %arrayidx.2 = getelementptr inbounds float, float* %x, i64 3
+ %2 = load float, float* %arrayidx.2, align 4
+ %add.2 = fadd fast float %2, %add.1
+ %arrayidx.3 = getelementptr inbounds float, float* %x, i64 4
+ %3 = load float, float* %arrayidx.3, align 4
+ %add.3 = fadd fast float %3, %add.2
+ %arrayidx.4 = getelementptr inbounds float, float* %x, i64 5
+ %4 = load float, float* %arrayidx.4, align 4
+ %add.4 = fadd fast float %4, %add.3
+ %arrayidx.5 = getelementptr inbounds float, float* %x, i64 6
+ %5 = load float, float* %arrayidx.5, align 4
+ %add.5 = fadd fast float %5, %add.4
+ %arrayidx.6 = getelementptr inbounds float, float* %x, i64 7
+ %6 = load float, float* %arrayidx.6, align 4
+ %add.6 = fadd fast float %6, %add.5
+ %arrayidx.7 = getelementptr inbounds float, float* %x, i64 8
+ %7 = load float, float* %arrayidx.7, align 4
+ %add.7 = fadd fast float %7, %add.6
+ %arrayidx.8 = getelementptr inbounds float, float* %x, i64 9
+ %8 = load float, float* %arrayidx.8, align 4
+ %add.8 = fadd fast float %8, %add.7
+ %arrayidx.9 = getelementptr inbounds float, float* %x, i64 10
+ %9 = load float, float* %arrayidx.9, align 4
+ %add.9 = fadd fast float %9, %add.8
+ %arrayidx.10 = getelementptr inbounds float, float* %x, i64 11
+ %10 = load float, float* %arrayidx.10, align 4
+ %add.10 = fadd fast float %10, %add.9
+ %arrayidx.11 = getelementptr inbounds float, float* %x, i64 12
+ %11 = load float, float* %arrayidx.11, align 4
+ %add.11 = fadd fast float %11, %add.10
+ %arrayidx.12 = getelementptr inbounds float, float* %x, i64 13
+ %12 = load float, float* %arrayidx.12, align 4
+ %add.12 = fadd fast float %12, %add.11
+ %arrayidx.13 = getelementptr inbounds float, float* %x, i64 14
+ %13 = load float, float* %arrayidx.13, align 4
+ %add.13 = fadd fast float %13, %add.12
+ %arrayidx.14 = getelementptr inbounds float, float* %x, i64 15
+ %14 = load float, float* %arrayidx.14, align 4
+ %add.14 = fadd fast float %14, %add.13
+ %arrayidx.15 = getelementptr inbounds float, float* %x, i64 16
+ %15 = load float, float* %arrayidx.15, align 4
+ %add.15 = fadd fast float %15, %add.14
+ %arrayidx.16 = getelementptr inbounds float, float* %x, i64 17
+ %16 = load float, float* %arrayidx.16, align 4
+ %add.16 = fadd fast float %16, %add.15
+ %arrayidx.17 = getelementptr inbounds float, float* %x, i64 18
+ %17 = load float, float* %arrayidx.17, align 4
+ %add.17 = fadd fast float %17, %add.16
+ %arrayidx.18 = getelementptr inbounds float, float* %x, i64 19
+ %18 = load float, float* %arrayidx.18, align 4
+ %add.18 = fadd fast float %18, %add.17
+ %arrayidx.19 = getelementptr inbounds float, float* %x, i64 20
+ %19 = load float, float* %arrayidx.19, align 4
+ %add.19 = fadd fast float %19, %add.18
+ %arrayidx.20 = getelementptr inbounds float, float* %x, i64 21
+ %20 = load float, float* %arrayidx.20, align 4
+ %add.20 = fadd fast float %20, %add.19
+ %arrayidx.21 = getelementptr inbounds float, float* %x, i64 22
+ %21 = load float, float* %arrayidx.21, align 4
+ %add.21 = fadd fast float %21, %add.20
+ %arrayidx.22 = getelementptr inbounds float, float* %x, i64 23
+ %22 = load float, float* %arrayidx.22, align 4
+ %add.22 = fadd fast float %22, %add.21
+ %arrayidx.23 = getelementptr inbounds float, float* %x, i64 24
+ %23 = load float, float* %arrayidx.23, align 4
+ %add.23 = fadd fast float %23, %add.22
+ %arrayidx.24 = getelementptr inbounds float, float* %x, i64 25
+ %24 = load float, float* %arrayidx.24, align 4
+ %add.24 = fadd fast float %24, %add.23
+ %arrayidx.25 = getelementptr inbounds float, float* %x, i64 26
+ %25 = load float, float* %arrayidx.25, align 4
+ %add.25 = fadd fast float %25, %add.24
+ %arrayidx.26 = getelementptr inbounds float, float* %x, i64 27
+ %26 = load float, float* %arrayidx.26, align 4
+ %add.26 = fadd fast float %26, %add.25
+ %arrayidx.27 = getelementptr inbounds float, float* %x, i64 28
+ %27 = load float, float* %arrayidx.27, align 4
+ %add.27 = fadd fast float %27, %add.26
+ %arrayidx.28 = getelementptr inbounds float, float* %x, i64 29
+ %28 = load float, float* %arrayidx.28, align 4
+ %add.28 = fadd fast float %28, %add.27
+ %arrayidx.29 = getelementptr inbounds float, float* %x, i64 30
+ %29 = load float, float* %arrayidx.29, align 4
+ %add.29 = fadd fast float %29, %add.28
+ ret float %add.29
+}
+
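+; The reduction below mixes in non-loaded operands (%conv and the constant
+; 3.0 via %add); check that they are kept out of the vectorized chain and
+; re-applied after the horizontal sum as OP_EXTRA/OP_EXTRA5.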
+define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
+; CHECK-LABEL: @extra_args(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT: ret float [[OP_EXTRA5]]
+;
+; THRESHOLD-LABEL: @extra_args(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT: ret float [[OP_EXTRA5]]
+;
+ entry:
+ %mul = mul nsw i32 %b, %a
+ %conv = sitofp i32 %mul to float
+ %0 = load float, float* %x, align 4
+ %add = fadd fast float %conv, 3.000000e+00
+ %add1 = fadd fast float %0, %add
+ %arrayidx3 = getelementptr inbounds float, float* %x, i64 1
+ %1 = load float, float* %arrayidx3, align 4
+ %add4 = fadd fast float %1, %add1
+ %add5 = fadd fast float %add4, %conv
+ %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2
+ %2 = load float, float* %arrayidx3.1, align 4
+ %add4.1 = fadd fast float %2, %add5
+ %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3
+ %3 = load float, float* %arrayidx3.2, align 4
+ %add4.2 = fadd fast float %3, %add4.1
+ %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4
+ %4 = load float, float* %arrayidx3.3, align 4
+ %add4.3 = fadd fast float %4, %add4.2
+ %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5
+ %5 = load float, float* %arrayidx3.4, align 4
+ %add4.4 = fadd fast float %5, %add4.3
+ %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6
+ %6 = load float, float* %arrayidx3.5, align 4
+ %add4.5 = fadd fast float %6, %add4.4
+ %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7
+ %7 = load float, float* %arrayidx3.6, align 4
+ %add4.6 = fadd fast float %7, %add4.5
+ ret float %add4.6
+}
+
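+; Same as above, but the constant 5.0 feeds the chain twice; check that both
+; occurrences survive as separate fadds (OP_EXTRA5/OP_EXTRA6) after the
+; reduction instead of being merged.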
+define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a, i32 %b) {
+; CHECK-LABEL: @extra_args_same_several_times(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT: [[ADD41:%.*]] = fadd fast float [[ADD4]], 5.000000e+00
+; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD41]], [[CONV]]
+; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT: [[ADD4_11:%.*]] = fadd fast float [[ADD4_1]], 5.000000e+00
+; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_11]]
+; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
+; CHECK-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00
+; CHECK-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]]
+; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT: ret float [[OP_EXTRA7]]
+;
+; THRESHOLD-LABEL: @extra_args_same_several_times(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT: [[ADD41:%.*]] = fadd fast float [[ADD4]], 5.000000e+00
+; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD41]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT: [[ADD4_11:%.*]] = fadd fast float [[ADD4_1]], 5.000000e+00
+; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_11]]
+; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], 5.000000e+00
+; THRESHOLD-NEXT: [[OP_EXTRA6:%.*]] = fadd fast float [[OP_EXTRA5]], 5.000000e+00
+; THRESHOLD-NEXT: [[OP_EXTRA7:%.*]] = fadd fast float [[OP_EXTRA6]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT: ret float [[OP_EXTRA7]]
+;
+ entry:
+ %mul = mul nsw i32 %b, %a
+ %conv = sitofp i32 %mul to float
+ %0 = load float, float* %x, align 4
+ %add = fadd fast float %conv, 3.000000e+00
+ %add1 = fadd fast float %0, %add
+ %arrayidx3 = getelementptr inbounds float, float* %x, i64 1
+ %1 = load float, float* %arrayidx3, align 4
+ %add4 = fadd fast float %1, %add1
+ %add41 = fadd fast float %add4, 5.000000e+00
+ %add5 = fadd fast float %add41, %conv
+ %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2
+ %2 = load float, float* %arrayidx3.1, align 4
+ %add4.1 = fadd fast float %2, %add5
+ %add4.11 = fadd fast float %add4.1, 5.000000e+00
+ %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3
+ %3 = load float, float* %arrayidx3.2, align 4
+ %add4.2 = fadd fast float %3, %add4.11
+ %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4
+ %4 = load float, float* %arrayidx3.3, align 4
+ %add4.3 = fadd fast float %4, %add4.2
+ %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5
+ %5 = load float, float* %arrayidx3.4, align 4
+ %add4.4 = fadd fast float %5, %add4.3
+ %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6
+ %6 = load float, float* %arrayidx3.5, align 4
+ %add4.5 = fadd fast float %6, %add4.4
+ %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7
+ %7 = load float, float* %arrayidx3.6, align 4
+ %add4.6 = fadd fast float %7, %add4.5
+ ret float %add4.6
+}
+
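+; Here the extra operand %conv enters the chain in the middle rather than at
+; the head; check that it is still pulled out and re-applied after the
+; <8 x float> reduction as OP_EXTRA5.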
+define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: @extra_args_no_replace(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
+; CHECK-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
+; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
+; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT: ret float [[OP_EXTRA5]]
+;
+; THRESHOLD-LABEL: @extra_args_no_replace(
+; THRESHOLD-NEXT: entry:
+; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
+; THRESHOLD-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
+; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
+; THRESHOLD-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
+; THRESHOLD-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; THRESHOLD-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT: [[OP_EXTRA5:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT: ret float [[OP_EXTRA5]]
+;
+ entry:
+ %mul = mul nsw i32 %b, %a
+ %conv = sitofp i32 %mul to float
+ %0 = load float, float* %x, align 4
+ %convc = sitofp i32 %c to float
+ %addc = fadd fast float %convc, 3.000000e+00
+ %add = fadd fast float %conv, %addc
+ %add1 = fadd fast float %0, %add
+ %arrayidx3 = getelementptr inbounds float, float* %x, i64 1
+ %1 = load float, float* %arrayidx3, align 4
+ %add4 = fadd fast float %1, %add1
+ %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2
+ %2 = load float, float* %arrayidx3.1, align 4
+ %add4.1 = fadd fast float %2, %add4
+ %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3
+ %3 = load float, float* %arrayidx3.2, align 4
+ %add4.2 = fadd fast float %3, %add4.1
+ %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4
+ %4 = load float, float* %arrayidx3.3, align 4
+ %add4.3 = fadd fast float %4, %add4.2
+ %add5 = fadd fast float %add4.3, %conv
+ %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5
+ %5 = load float, float* %arrayidx3.4, align 4
+ %add4.4 = fadd fast float %5, %add5
+ %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6
+ %6 = load float, float* %arrayidx3.5, align 4
+ %add4.5 = fadd fast float %6, %add4.4
+ %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7
+ %7 = load float, float* %arrayidx3.6, align 4
+ %add4.6 = fadd fast float %7, %add4.5
+ ret float %add4.6
+}
+
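+; Four identical xor/icmp/sext chains feed an add reduction. The last xor
+; result is also used by the final scalar add, so the checks expect it to be
+; extracted from the vectorized xor ([[TMP9]]) rather than recomputed.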
+define i32 @wobble(i32 %arg, i32 %bar) {
+; CHECK-LABEL: @wobble(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[R1:%.*]] = add nuw i32 [[ARG]], undef
+; CHECK-NEXT: [[R2:%.*]] = add nsw i32 [[R1]], undef
+; CHECK-NEXT: [[R3:%.*]] = add nsw i32 [[R2]], undef
+; CHECK-NEXT: [[R4:%.*]] = add nsw i32 [[R3]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
+; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
+; CHECK-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], [[TMP9]]
+; CHECK-NEXT: ret i32 [[OP_EXTRA3]]
+;
+; THRESHOLD-LABEL: @wobble(
+; THRESHOLD-NEXT: bb:
+; THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0
+; THRESHOLD-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1
+; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2
+; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3
+; THRESHOLD-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0
+; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1
+; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2
+; THRESHOLD-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3
+; THRESHOLD-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
+; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; THRESHOLD-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
+; THRESHOLD-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; THRESHOLD-NEXT: [[R1:%.*]] = add nuw i32 [[ARG]], undef
+; THRESHOLD-NEXT: [[R2:%.*]] = add nsw i32 [[R1]], undef
+; THRESHOLD-NEXT: [[R3:%.*]] = add nsw i32 [[R2]], undef
+; THRESHOLD-NEXT: [[R4:%.*]] = add nsw i32 [[R3]], undef
+; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
+; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
+; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = add nsw i32 [[OP_EXTRA]], [[TMP9]]
+; THRESHOLD-NEXT: [[R5:%.*]] = add nsw i32 [[R4]], [[TMP9]]
+; THRESHOLD-NEXT: ret i32 [[OP_EXTRA3]]
+;
+ bb:
+ %x1 = xor i32 %arg, %bar
+ %i1 = icmp eq i32 %x1, 0
+ %s1 = sext i1 %i1 to i32
+ %x2 = xor i32 %arg, %bar
+ %i2 = icmp eq i32 %x2, 0
+ %s2 = sext i1 %i2 to i32
+ %x3 = xor i32 %arg, %bar
+ %i3 = icmp eq i32 %x3, 0
+ %s3 = sext i1 %i3 to i32
+ %x4 = xor i32 %arg, %bar
+ %i4 = icmp eq i32 %x4, 0
+ %s4 = sext i1 %i4 to i32
+ %r1 = add nuw i32 %arg, %s1
+ %r2 = add nsw i32 %r1, %s2
+ %r3 = add nsw i32 %r2, %s3
+ %r4 = add nsw i32 %r3, %s4
+ %r5 = add nsw i32 %r4, %x4
+ ret i32 %r5
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
new file mode 100644
index 00000000000..271244e54fc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -0,0 +1,1141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,SKX
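+; The SKX run passes -slp-threshold=-100 to force vectorization even where
+; the cost model would otherwise reject it.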
+
+@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
+@arr1 = local_unnamed_addr global [32 x float] zeroinitializer, align 16
+@arrp = local_unnamed_addr global [32 x i32*] zeroinitializer, align 16
+@var = global i32 zeroinitializer, align 8
+
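+; Signed-max reduction of the first 8 elements of @arr, written as a chain of
+; icmp sgt + select; the checks expect a log2-step shuffle/cmp/select
+; horizontal reduction on the vector load.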
+define i32 @maxi8(i32) {
+; CHECK-LABEL: @maxi8(
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP2]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
+; CHECK-NEXT: ret i32 [[TMP16]]
+;
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+ %4 = icmp sgt i32 %2, %3
+ %5 = select i1 %4, i32 %2, i32 %3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+ %7 = icmp sgt i32 %5, %6
+ %8 = select i1 %7, i32 %5, i32 %6
+ %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+ %10 = icmp sgt i32 %8, %9
+ %11 = select i1 %10, i32 %8, i32 %9
+ %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+ %13 = icmp sgt i32 %11, %12
+ %14 = select i1 %13, i32 %11, i32 %12
+ %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+ %16 = icmp sgt i32 %14, %15
+ %17 = select i1 %16, i32 %14, i32 %15
+ %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+ %19 = icmp sgt i32 %17, %18
+ %20 = select i1 %19, i32 %17, i32 %18
+ %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+ %22 = icmp sgt i32 %20, %21
+ %23 = select i1 %22, i32 %20, i32 %21
+ ret i32 %23
+}
+
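+; Same signed-max reduction, widened to the first 16 elements of @arr.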
+define i32 @maxi16(i32) {
+; CHECK-LABEL: @maxi16(
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr to <16 x i32>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
+; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
+; CHECK-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
+; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
+; CHECK-NEXT: [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
+; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
+; CHECK-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
+; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
+; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
+; CHECK-NEXT: [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
+; CHECK-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], undef
+; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 undef
+; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <16 x i32> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i32> [[TMP2]], <16 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i32> [[RDX_MINMAX_SELECT]], <16 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i32> [[RDX_MINMAX_SELECT3]], <16 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <16 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i32> [[RDX_MINMAX_SELECT6]], <16 x i32> [[RDX_SHUF7]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i32> [[RDX_MINMAX_SELECT9]], i32 0
+; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
+; CHECK-NEXT: ret i32 [[TMP32]]
+;
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+ %4 = icmp sgt i32 %2, %3
+ %5 = select i1 %4, i32 %2, i32 %3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+ %7 = icmp sgt i32 %5, %6
+ %8 = select i1 %7, i32 %5, i32 %6
+ %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+ %10 = icmp sgt i32 %8, %9
+ %11 = select i1 %10, i32 %8, i32 %9
+ %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+ %13 = icmp sgt i32 %11, %12
+ %14 = select i1 %13, i32 %11, i32 %12
+ %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+ %16 = icmp sgt i32 %14, %15
+ %17 = select i1 %16, i32 %14, i32 %15
+ %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+ %19 = icmp sgt i32 %17, %18
+ %20 = select i1 %19, i32 %17, i32 %18
+ %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+ %22 = icmp sgt i32 %20, %21
+ %23 = select i1 %22, i32 %20, i32 %21
+ %24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+ %25 = icmp sgt i32 %23, %24
+ %26 = select i1 %25, i32 %23, i32 %24
+ %27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+ %28 = icmp sgt i32 %26, %27
+ %29 = select i1 %28, i32 %26, i32 %27
+ %30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+ %31 = icmp sgt i32 %29, %30
+ %32 = select i1 %31, i32 %29, i32 %30
+ %33 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+ %34 = icmp sgt i32 %32, %33
+ %35 = select i1 %34, i32 %32, i32 %33
+ %36 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+ %37 = icmp sgt i32 %35, %36
+ %38 = select i1 %37, i32 %35, i32 %36
+ %39 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+ %40 = icmp sgt i32 %38, %39
+ %41 = select i1 %40, i32 %38, i32 %39
+ %42 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+ %43 = icmp sgt i32 %41, %42
+ %44 = select i1 %43, i32 %41, i32 %42
+ %45 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+ %46 = icmp sgt i32 %44, %45
+ %47 = select i1 %46, i32 %44, i32 %45
+ ret i32 %47
+}
+
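+; Same signed-max reduction over all 32 elements of @arr.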
+define i32 @maxi32(i32) {
+; CHECK-LABEL: @maxi32(
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr to <32 x i32>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 undef, i32 undef
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 undef
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], undef
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 undef
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
+; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
+; CHECK-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
+; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
+; CHECK-NEXT: [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
+; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
+; CHECK-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
+; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
+; CHECK-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
+; CHECK-NEXT: [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
+; CHECK-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], undef
+; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 undef
+; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], undef
+; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 undef
+; CHECK-NEXT: [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], undef
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 undef
+; CHECK-NEXT: [[TMP35:%.*]] = icmp sgt i32 [[TMP34]], undef
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 [[TMP34]], i32 undef
+; CHECK-NEXT: [[TMP37:%.*]] = icmp sgt i32 [[TMP36]], undef
+; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 [[TMP36]], i32 undef
+; CHECK-NEXT: [[TMP39:%.*]] = icmp sgt i32 [[TMP38]], undef
+; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP39]], i32 [[TMP38]], i32 undef
+; CHECK-NEXT: [[TMP41:%.*]] = icmp sgt i32 [[TMP40]], undef
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 [[TMP40]], i32 undef
+; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP42]], undef
+; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP42]], i32 undef
+; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 [[TMP44]], undef
+; CHECK-NEXT: [[TMP46:%.*]] = select i1 [[TMP45]], i32 [[TMP44]], i32 undef
+; CHECK-NEXT: [[TMP47:%.*]] = icmp sgt i32 [[TMP46]], undef
+; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i32 [[TMP46]], i32 undef
+; CHECK-NEXT: [[TMP49:%.*]] = icmp sgt i32 [[TMP48]], undef
+; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i32 [[TMP48]], i32 undef
+; CHECK-NEXT: [[TMP51:%.*]] = icmp sgt i32 [[TMP50]], undef
+; CHECK-NEXT: [[TMP52:%.*]] = select i1 [[TMP51]], i32 [[TMP50]], i32 undef
+; CHECK-NEXT: [[TMP53:%.*]] = icmp sgt i32 [[TMP52]], undef
+; CHECK-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], i32 [[TMP52]], i32 undef
+; CHECK-NEXT: [[TMP55:%.*]] = icmp sgt i32 [[TMP54]], undef
+; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 [[TMP54]], i32 undef
+; CHECK-NEXT: [[TMP57:%.*]] = icmp sgt i32 [[TMP56]], undef
+; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP57]], i32 [[TMP56]], i32 undef
+; CHECK-NEXT: [[TMP59:%.*]] = icmp sgt i32 [[TMP58]], undef
+; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[TMP58]], i32 undef
+; CHECK-NEXT: [[TMP61:%.*]] = icmp sgt i32 [[TMP60]], undef
+; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], i32 [[TMP60]], i32 undef
+; CHECK-NEXT: [[TMP63:%.*]] = icmp sgt i32 [[TMP62]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP2]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <32 x i32> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x i32> [[TMP2]], <32 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x i32> [[RDX_MINMAX_SELECT]], <32 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x i32> [[RDX_MINMAX_SELECT3]], <32 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x i32> [[RDX_MINMAX_SELECT6]], <32 x i32> [[RDX_SHUF7]]
+; CHECK-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP11:%.*]] = icmp sgt <32 x i32> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x i32> [[RDX_MINMAX_SELECT9]], <32 x i32> [[RDX_SHUF10]]
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <32 x i32> [[RDX_MINMAX_SELECT12]], i32 0
+; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP63]], i32 [[TMP62]], i32 undef
+; CHECK-NEXT: ret i32 [[TMP64]]
+;
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+ %4 = icmp sgt i32 %2, %3
+ %5 = select i1 %4, i32 %2, i32 %3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+ %7 = icmp sgt i32 %5, %6
+ %8 = select i1 %7, i32 %5, i32 %6
+ %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+ %10 = icmp sgt i32 %8, %9
+ %11 = select i1 %10, i32 %8, i32 %9
+ %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+ %13 = icmp sgt i32 %11, %12
+ %14 = select i1 %13, i32 %11, i32 %12
+ %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+ %16 = icmp sgt i32 %14, %15
+ %17 = select i1 %16, i32 %14, i32 %15
+ %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+ %19 = icmp sgt i32 %17, %18
+ %20 = select i1 %19, i32 %17, i32 %18
+ %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+ %22 = icmp sgt i32 %20, %21
+ %23 = select i1 %22, i32 %20, i32 %21
+ %24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 8), align 16
+ %25 = icmp sgt i32 %23, %24
+ %26 = select i1 %25, i32 %23, i32 %24
+ %27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 9), align 4
+ %28 = icmp sgt i32 %26, %27
+ %29 = select i1 %28, i32 %26, i32 %27
+ %30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 10), align 8
+ %31 = icmp sgt i32 %29, %30
+ %32 = select i1 %31, i32 %29, i32 %30
+ %33 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 11), align 4
+ %34 = icmp sgt i32 %32, %33
+ %35 = select i1 %34, i32 %32, i32 %33
+ %36 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 12), align 16
+ %37 = icmp sgt i32 %35, %36
+ %38 = select i1 %37, i32 %35, i32 %36
+ %39 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 13), align 4
+ %40 = icmp sgt i32 %38, %39
+ %41 = select i1 %40, i32 %38, i32 %39
+ %42 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 14), align 8
+ %43 = icmp sgt i32 %41, %42
+ %44 = select i1 %43, i32 %41, i32 %42
+ %45 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 15), align 4
+ %46 = icmp sgt i32 %44, %45
+ %47 = select i1 %46, i32 %44, i32 %45
+ %48 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 16), align 16
+ %49 = icmp sgt i32 %47, %48
+ %50 = select i1 %49, i32 %47, i32 %48
+ %51 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 17), align 4
+ %52 = icmp sgt i32 %50, %51
+ %53 = select i1 %52, i32 %50, i32 %51
+ %54 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 18), align 8
+ %55 = icmp sgt i32 %53, %54
+ %56 = select i1 %55, i32 %53, i32 %54
+ %57 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 19), align 4
+ %58 = icmp sgt i32 %56, %57
+ %59 = select i1 %58, i32 %56, i32 %57
+ %60 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 20), align 16
+ %61 = icmp sgt i32 %59, %60
+ %62 = select i1 %61, i32 %59, i32 %60
+ %63 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 21), align 4
+ %64 = icmp sgt i32 %62, %63
+ %65 = select i1 %64, i32 %62, i32 %63
+ %66 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 22), align 8
+ %67 = icmp sgt i32 %65, %66
+ %68 = select i1 %67, i32 %65, i32 %66
+ %69 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 23), align 4
+ %70 = icmp sgt i32 %68, %69
+ %71 = select i1 %70, i32 %68, i32 %69
+ %72 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 24), align 16
+ %73 = icmp sgt i32 %71, %72
+ %74 = select i1 %73, i32 %71, i32 %72
+ %75 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 25), align 4
+ %76 = icmp sgt i32 %74, %75
+ %77 = select i1 %76, i32 %74, i32 %75
+ %78 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 26), align 8
+ %79 = icmp sgt i32 %77, %78
+ %80 = select i1 %79, i32 %77, i32 %78
+ %81 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 27), align 4
+ %82 = icmp sgt i32 %80, %81
+ %83 = select i1 %82, i32 %80, i32 %81
+ %84 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 28), align 16
+ %85 = icmp sgt i32 %83, %84
+ %86 = select i1 %85, i32 %83, i32 %84
+ %87 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 29), align 4
+ %88 = icmp sgt i32 %86, %87
+ %89 = select i1 %88, i32 %86, i32 %87
+ %90 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 30), align 8
+ %91 = icmp sgt i32 %89, %90
+ %92 = select i1 %91, i32 %89, i32 %90
+ %93 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 31), align 4
+ %94 = icmp sgt i32 %92, %93
+ %95 = select i1 %94, i32 %92, i32 %93
+ ret i32 %95
+}
+
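+; Fast-math float max reduction (fcmp fast ogt + select) over the first 8
+; elements of @arr1; the checks expect the same shuffle-based reduction shape
+; as the integer tests above.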
+define float @maxf8(float) {
+; CHECK-LABEL: @maxf8(
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr1 to <8 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt float undef, undef
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
+; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
+; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
+; CHECK-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
+; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
+; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <8 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[TMP2]], <8 x float> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x float> [[RDX_MINMAX_SELECT]], <8 x float> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <8 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x float> [[RDX_MINMAX_SELECT3]], <8 x float> [[RDX_SHUF4]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x float> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
+; CHECK-NEXT: ret float [[TMP16]]
+;
+ %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+ %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+ %4 = fcmp fast ogt float %2, %3
+ %5 = select i1 %4, float %2, float %3
+ %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+ %7 = fcmp fast ogt float %5, %6
+ %8 = select i1 %7, float %5, float %6
+ %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+ %10 = fcmp fast ogt float %8, %9
+ %11 = select i1 %10, float %8, float %9
+ %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+ %13 = fcmp fast ogt float %11, %12
+ %14 = select i1 %13, float %11, float %12
+ %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+ %16 = fcmp fast ogt float %14, %15
+ %17 = select i1 %16, float %14, float %15
+ %18 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+ %19 = fcmp fast ogt float %17, %18
+ %20 = select i1 %19, float %17, float %18
+ %21 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+ %22 = fcmp fast ogt float %20, %21
+ %23 = select i1 %22, float %20, float %21
+ ret float %23
+}
+
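+; Same fast-math float max reduction, widened to 16 elements of @arr1.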
+define float @maxf16(float) {
+; CHECK-LABEL: @maxf16(
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr1 to <16 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt float undef, undef
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
+; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
+; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
+; CHECK-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
+; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
+; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
+; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
+; CHECK-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
+; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
+; CHECK-NEXT: [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
+; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
+; CHECK-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
+; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
+; CHECK-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
+; CHECK-NEXT: [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
+; CHECK-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
+; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
+; CHECK-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <16 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x float> [[TMP2]], <16 x float> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x float> [[RDX_MINMAX_SELECT]], <16 x float> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x float> [[RDX_MINMAX_SELECT3]], <16 x float> [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <16 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x float> [[RDX_MINMAX_SELECT6]], <16 x float> [[RDX_SHUF7]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x float> [[RDX_MINMAX_SELECT9]], i32 0
+; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
+; CHECK-NEXT: ret float [[TMP32]]
+;
+ %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+ %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+ %4 = fcmp fast ogt float %2, %3
+ %5 = select i1 %4, float %2, float %3
+ %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+ %7 = fcmp fast ogt float %5, %6
+ %8 = select i1 %7, float %5, float %6
+ %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+ %10 = fcmp fast ogt float %8, %9
+ %11 = select i1 %10, float %8, float %9
+ %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+ %13 = fcmp fast ogt float %11, %12
+ %14 = select i1 %13, float %11, float %12
+ %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+ %16 = fcmp fast ogt float %14, %15
+ %17 = select i1 %16, float %14, float %15
+ %18 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+ %19 = fcmp fast ogt float %17, %18
+ %20 = select i1 %19, float %17, float %18
+ %21 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+ %22 = fcmp fast ogt float %20, %21
+ %23 = select i1 %22, float %20, float %21
+ %24 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+ %25 = fcmp fast ogt float %23, %24
+ %26 = select i1 %25, float %23, float %24
+ %27 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+ %28 = fcmp fast ogt float %26, %27
+ %29 = select i1 %28, float %26, float %27
+ %30 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+ %31 = fcmp fast ogt float %29, %30
+ %32 = select i1 %31, float %29, float %30
+ %33 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+ %34 = fcmp fast ogt float %32, %33
+ %35 = select i1 %34, float %32, float %33
+ %36 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+ %37 = fcmp fast ogt float %35, %36
+ %38 = select i1 %37, float %35, float %36
+ %39 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+ %40 = fcmp fast ogt float %38, %39
+ %41 = select i1 %40, float %38, float %39
+ %42 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+ %43 = fcmp fast ogt float %41, %42
+ %44 = select i1 %43, float %41, float %42
+ %45 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+ %46 = fcmp fast ogt float %44, %45
+ %47 = select i1 %46, float %44, float %45
+ ret float %47
+}
+
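+; Same fast-math float max reduction over all 32 elements of @arr1.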
+define float @maxf32(float) {
+; CHECK-LABEL: @maxf32(
+; CHECK-NEXT: [[TMP2:%.*]] = load <32 x float>, <32 x float>* bitcast ([32 x float]* @arr1 to <32 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast ogt float undef, undef
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], float undef, float undef
+; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast ogt float [[TMP4]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float [[TMP4]], float undef
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp fast ogt float [[TMP6]], undef
+; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], float [[TMP6]], float undef
+; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt float [[TMP8]], undef
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], float [[TMP8]], float undef
+; CHECK-NEXT: [[TMP11:%.*]] = fcmp fast ogt float [[TMP10]], undef
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], float [[TMP10]], float undef
+; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt float [[TMP12]], undef
+; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], float [[TMP12]], float undef
+; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast ogt float [[TMP14]], undef
+; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float [[TMP14]], float undef
+; CHECK-NEXT: [[TMP17:%.*]] = fcmp fast ogt float [[TMP16]], undef
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], float [[TMP16]], float undef
+; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast ogt float [[TMP18]], undef
+; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], float [[TMP18]], float undef
+; CHECK-NEXT: [[TMP21:%.*]] = fcmp fast ogt float [[TMP20]], undef
+; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], float [[TMP20]], float undef
+; CHECK-NEXT: [[TMP23:%.*]] = fcmp fast ogt float [[TMP22]], undef
+; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], float [[TMP22]], float undef
+; CHECK-NEXT: [[TMP25:%.*]] = fcmp fast ogt float [[TMP24]], undef
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float [[TMP24]], float undef
+; CHECK-NEXT: [[TMP27:%.*]] = fcmp fast ogt float [[TMP26]], undef
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], float [[TMP26]], float undef
+; CHECK-NEXT: [[TMP29:%.*]] = fcmp fast ogt float [[TMP28]], undef
+; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], float [[TMP28]], float undef
+; CHECK-NEXT: [[TMP31:%.*]] = fcmp fast ogt float [[TMP30]], undef
+; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], float [[TMP30]], float undef
+; CHECK-NEXT: [[TMP33:%.*]] = fcmp fast ogt float [[TMP32]], undef
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], float [[TMP32]], float undef
+; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast ogt float [[TMP34]], undef
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float [[TMP34]], float undef
+; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast ogt float [[TMP36]], undef
+; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], float [[TMP36]], float undef
+; CHECK-NEXT: [[TMP39:%.*]] = fcmp fast ogt float [[TMP38]], undef
+; CHECK-NEXT: [[TMP40:%.*]] = select i1 [[TMP39]], float [[TMP38]], float undef
+; CHECK-NEXT: [[TMP41:%.*]] = fcmp fast ogt float [[TMP40]], undef
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], float [[TMP40]], float undef
+; CHECK-NEXT: [[TMP43:%.*]] = fcmp fast ogt float [[TMP42]], undef
+; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], float [[TMP42]], float undef
+; CHECK-NEXT: [[TMP45:%.*]] = fcmp fast ogt float [[TMP44]], undef
+; CHECK-NEXT: [[TMP46:%.*]] = select i1 [[TMP45]], float [[TMP44]], float undef
+; CHECK-NEXT: [[TMP47:%.*]] = fcmp fast ogt float [[TMP46]], undef
+; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], float [[TMP46]], float undef
+; CHECK-NEXT: [[TMP49:%.*]] = fcmp fast ogt float [[TMP48]], undef
+; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], float [[TMP48]], float undef
+; CHECK-NEXT: [[TMP51:%.*]] = fcmp fast ogt float [[TMP50]], undef
+; CHECK-NEXT: [[TMP52:%.*]] = select i1 [[TMP51]], float [[TMP50]], float undef
+; CHECK-NEXT: [[TMP53:%.*]] = fcmp fast ogt float [[TMP52]], undef
+; CHECK-NEXT: [[TMP54:%.*]] = select i1 [[TMP53]], float [[TMP52]], float undef
+; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast ogt float [[TMP54]], undef
+; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], float [[TMP54]], float undef
+; CHECK-NEXT: [[TMP57:%.*]] = fcmp fast ogt float [[TMP56]], undef
+; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP57]], float [[TMP56]], float undef
+; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast ogt float [[TMP58]], undef
+; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], float [[TMP58]], float undef
+; CHECK-NEXT: [[TMP61:%.*]] = fcmp fast ogt float [[TMP60]], undef
+; CHECK-NEXT: [[TMP62:%.*]] = select i1 [[TMP61]], float [[TMP60]], float undef
+; CHECK-NEXT: [[TMP63:%.*]] = fcmp fast ogt float [[TMP62]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP2]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <32 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP]], <32 x float> [[TMP2]], <32 x float> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP2]], <32 x float> [[RDX_MINMAX_SELECT]], <32 x float> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP5]], <32 x float> [[RDX_MINMAX_SELECT3]], <32 x float> [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP8:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP8]], <32 x float> [[RDX_MINMAX_SELECT6]], <32 x float> [[RDX_SHUF7]]
+; CHECK-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP11:%.*]] = fcmp fast ogt <32 x float> [[RDX_MINMAX_SELECT9]], [[RDX_SHUF10]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT12:%.*]] = select <32 x i1> [[RDX_MINMAX_CMP11]], <32 x float> [[RDX_MINMAX_SELECT9]], <32 x float> [[RDX_SHUF10]]
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <32 x float> [[RDX_MINMAX_SELECT12]], i32 0
+; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[TMP63]], float [[TMP62]], float undef
+; CHECK-NEXT: ret float [[TMP64]]
+;
+ %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16
+ %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 1), align 4
+ %4 = fcmp fast ogt float %2, %3
+ %5 = select i1 %4, float %2, float %3
+ %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 2), align 8
+ %7 = fcmp fast ogt float %5, %6
+ %8 = select i1 %7, float %5, float %6
+ %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 3), align 4
+ %10 = fcmp fast ogt float %8, %9
+ %11 = select i1 %10, float %8, float %9
+ %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 4), align 16
+ %13 = fcmp fast ogt float %11, %12
+ %14 = select i1 %13, float %11, float %12
+ %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 5), align 4
+ %16 = fcmp fast ogt float %14, %15
+ %17 = select i1 %16, float %14, float %15
+ %18 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 6), align 8
+ %19 = fcmp fast ogt float %17, %18
+ %20 = select i1 %19, float %17, float %18
+ %21 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 7), align 4
+ %22 = fcmp fast ogt float %20, %21
+ %23 = select i1 %22, float %20, float %21
+ %24 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 8), align 16
+ %25 = fcmp fast ogt float %23, %24
+ %26 = select i1 %25, float %23, float %24
+ %27 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 9), align 4
+ %28 = fcmp fast ogt float %26, %27
+ %29 = select i1 %28, float %26, float %27
+ %30 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 10), align 8
+ %31 = fcmp fast ogt float %29, %30
+ %32 = select i1 %31, float %29, float %30
+ %33 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 11), align 4
+ %34 = fcmp fast ogt float %32, %33
+ %35 = select i1 %34, float %32, float %33
+ %36 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 12), align 16
+ %37 = fcmp fast ogt float %35, %36
+ %38 = select i1 %37, float %35, float %36
+ %39 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 13), align 4
+ %40 = fcmp fast ogt float %38, %39
+ %41 = select i1 %40, float %38, float %39
+ %42 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 14), align 8
+ %43 = fcmp fast ogt float %41, %42
+ %44 = select i1 %43, float %41, float %42
+ %45 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 15), align 4
+ %46 = fcmp fast ogt float %44, %45
+ %47 = select i1 %46, float %44, float %45
+ %48 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 16), align 16
+ %49 = fcmp fast ogt float %47, %48
+ %50 = select i1 %49, float %47, float %48
+ %51 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 17), align 4
+ %52 = fcmp fast ogt float %50, %51
+ %53 = select i1 %52, float %50, float %51
+ %54 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 18), align 8
+ %55 = fcmp fast ogt float %53, %54
+ %56 = select i1 %55, float %53, float %54
+ %57 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 19), align 4
+ %58 = fcmp fast ogt float %56, %57
+ %59 = select i1 %58, float %56, float %57
+ %60 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 20), align 16
+ %61 = fcmp fast ogt float %59, %60
+ %62 = select i1 %61, float %59, float %60
+ %63 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 21), align 4
+ %64 = fcmp fast ogt float %62, %63
+ %65 = select i1 %64, float %62, float %63
+ %66 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 22), align 8
+ %67 = fcmp fast ogt float %65, %66
+ %68 = select i1 %67, float %65, float %66
+ %69 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 23), align 4
+ %70 = fcmp fast ogt float %68, %69
+ %71 = select i1 %70, float %68, float %69
+ %72 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 24), align 16
+ %73 = fcmp fast ogt float %71, %72
+ %74 = select i1 %73, float %71, float %72
+ %75 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 25), align 4
+ %76 = fcmp fast ogt float %74, %75
+ %77 = select i1 %76, float %74, float %75
+ %78 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 26), align 8
+ %79 = fcmp fast ogt float %77, %78
+ %80 = select i1 %79, float %77, float %78
+ %81 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 27), align 4
+ %82 = fcmp fast ogt float %80, %81
+ %83 = select i1 %82, float %80, float %81
+ %84 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 28), align 16
+ %85 = fcmp fast ogt float %83, %84
+ %86 = select i1 %85, float %83, float %84
+ %87 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 29), align 4
+ %88 = fcmp fast ogt float %86, %87
+ %89 = select i1 %88, float %86, float %87
+ %90 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 30), align 8
+ %91 = fcmp fast ogt float %89, %90
+ %92 = select i1 %91, float %89, float %90
+ %93 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 31), align 4
+ %94 = fcmp fast ogt float %92, %93
+ %95 = select i1 %94, float %92, float %93
+ ret float %95
+}
+
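+; The first compare (%4) feeds both the max reduction and an unrelated
+; select stored to @var, so one value in the reduction chain has an
+; extra use outside the reduction.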
+define i32 @maxi8_multiple_uses(i32) {
+; SSE-LABEL: @maxi8_multiple_uses(
+; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; SSE-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SSE-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; SSE-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; SSE-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; SSE-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; SSE-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SSE-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SSE-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SSE-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; SSE-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; SSE-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SSE-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; SSE-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; SSE-NEXT: [[TMP24:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; SSE-NEXT: store i32 [[TMP24]], i32* @var, align 8
+; SSE-NEXT: ret i32 [[TMP23]]
+;
+; AVX-LABEL: @maxi8_multiple_uses(
+; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], undef
+; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 undef
+; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
+; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; AVX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; AVX-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; AVX-NEXT: [[TMP18:%.*]] = icmp sgt i32 [[TMP17]], [[TMP15]]
+; AVX-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP17]], i32 [[TMP15]]
+; AVX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], [[TMP5]]
+; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP5]]
+; AVX-NEXT: [[TMP21:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX-NEXT: [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP22]]
+; AVX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[OP_EXTRA]], i32 [[TMP22]]
+; AVX-NEXT: [[TMP25:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX-NEXT: store i32 [[TMP25]], i32* @var, align 8
+; AVX-NEXT: ret i32 [[TMP24]]
+;
+; AVX2-LABEL: @maxi8_multiple_uses(
+; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], undef
+; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 undef
+; AVX2-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; AVX2-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
+; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; AVX2-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; AVX2-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; AVX2-NEXT: [[TMP18:%.*]] = icmp sgt i32 [[TMP17]], [[TMP15]]
+; AVX2-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP17]], i32 [[TMP15]]
+; AVX2-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], [[TMP5]]
+; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP5]]
+; AVX2-NEXT: [[TMP21:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP22]]
+; AVX2-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[OP_EXTRA]], i32 [[TMP22]]
+; AVX2-NEXT: [[TMP25:%.*]] = select i1 [[TMP4]], i32 3, i32 4
+; AVX2-NEXT: store i32 [[TMP25]], i32* @var, align 8
+; AVX2-NEXT: ret i32 [[TMP24]]
+;
+; SKX-LABEL: @maxi8_multiple_uses(
+; SKX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
+; SKX-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; SKX-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; SKX-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; SKX-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP5]], [[RDX_SHUF]]
+; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP5]], <4 x i32> [[RDX_SHUF]]
+; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SKX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; SKX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; SKX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; SKX-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[TMP7]], i32 0
+; SKX-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1
+; SKX-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> undef, i32 [[TMP6]], i32 0
+; SKX-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[TMP4]], i32 1
+; SKX-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP9]], [[TMP11]]
+; SKX-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP9]], <2 x i32> [[TMP11]]
+; SKX-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
+; SKX-NEXT: [[TMP15:%.*]] = icmp sgt i32 [[TMP14]], undef
+; SKX-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 [[TMP14]], i32 undef
+; SKX-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP16]], undef
+; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP16]], i32 undef
+; SKX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP18]], undef
+; SKX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP18]], i32 undef
+; SKX-NEXT: [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
+; SKX-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
+; SKX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], [[TMP6]]
+; SKX-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0
+; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], [[TMP14]]
+; SKX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP14]]
+; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP6]]
+; SKX-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SKX-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP27]]
+; SKX-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 [[OP_EXTRA]], i32 [[TMP27]]
+; SKX-NEXT: [[TMP30:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
+; SKX-NEXT: [[TMP31:%.*]] = select i1 [[TMP30]], i32 3, i32 4
+; SKX-NEXT: store i32 [[TMP31]], i32* @var, align 8
+; SKX-NEXT: ret i32 [[TMP29]]
+;
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+ %4 = icmp sgt i32 %2, %3
+ %5 = select i1 %4, i32 %2, i32 %3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+ %7 = icmp sgt i32 %5, %6
+ %8 = select i1 %7, i32 %5, i32 %6
+ %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+ %10 = icmp sgt i32 %8, %9
+ %11 = select i1 %10, i32 %8, i32 %9
+ %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+ %13 = icmp sgt i32 %11, %12
+ %14 = select i1 %13, i32 %11, i32 %12
+ %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+ %16 = icmp sgt i32 %14, %15
+ %17 = select i1 %16, i32 %14, i32 %15
+ %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+ %19 = icmp sgt i32 %17, %18
+ %20 = select i1 %19, i32 %17, i32 %18
+ %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+ %22 = icmp sgt i32 %20, %21
+ %23 = select i1 %22, i32 %20, i32 %21
+ %24 = select i1 %4, i32 3, i32 4
+ store i32 %24, i32* @var, align 8
+ ret i32 %23
+}
+
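+; The initial compare is emitted in the entry block while the select that
+; consumes it sits in %pp, so the instructions of the reduction chain do
+; not all share the same parent block.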
+define i32 @maxi8_wrong_parent(i32) {
+; SSE-LABEL: @maxi8_wrong_parent(
+; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; SSE-NEXT: br label [[PP:%.*]]
+; SSE: pp:
+; SSE-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+; SSE-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], [[TMP6]]
+; SSE-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 [[TMP6]]
+; SSE-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+; SSE-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]]
+; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+; SSE-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; SSE-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+; SSE-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; SSE-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; SSE-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SSE-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; SSE-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; SSE-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SSE-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], [[TMP21]]
+; SSE-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32 [[TMP20]], i32 [[TMP21]]
+; SSE-NEXT: ret i32 [[TMP23]]
+;
+; AVX-LABEL: @maxi8_wrong_parent(
+; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX-NEXT: br label [[PP:%.*]]
+; AVX: pp:
+; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], undef
+; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 undef
+; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
+; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; AVX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; AVX-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; AVX-NEXT: [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], [[TMP15]]
+; AVX-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 [[TMP15]]
+; AVX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], [[TMP18]]
+; AVX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP18]]
+; AVX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], [[TMP5]]
+; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP5]]
+; AVX-NEXT: [[TMP26:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; AVX-NEXT: ret i32 [[OP_EXTRA]]
+;
+; AVX2-LABEL: @maxi8_wrong_parent(
+; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]]
+; AVX2-NEXT: br label [[PP:%.*]]
+; AVX2: pp:
+; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], undef
+; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP5]], i32 undef
+; AVX2-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], undef
+; AVX2-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 undef
+; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], undef
+; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP12]], undef
+; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP12]], i32 undef
+; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]]
+; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP14]], i32 [[TMP15]]
+; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; AVX2-NEXT: [[TMP19:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]]
+; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
+; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; AVX2-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; AVX2-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; AVX2-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; AVX2-NEXT: [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], [[TMP15]]
+; AVX2-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 [[TMP15]]
+; AVX2-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], [[TMP18]]
+; AVX2-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 [[TMP18]]
+; AVX2-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], [[TMP5]]
+; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 [[TMP5]]
+; AVX2-NEXT: [[TMP26:%.*]] = select i1 [[TMP19]], i32 [[TMP17]], i32 [[TMP18]]
+; AVX2-NEXT: ret i32 [[OP_EXTRA]]
+;
+; SKX-LABEL: @maxi8_wrong_parent(
+; SKX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16
+; SKX-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; SKX-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; SKX-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
+; SKX-NEXT: br label [[PP:%.*]]
+; SKX: pp:
+; SKX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8
+; SKX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+; SKX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]]
+; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; SKX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; SKX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; SKX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]]
+; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]]
+; SKX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
+; SKX-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> undef, i1 [[TMP12]], i32 0
+; SKX-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1
+; SKX-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP11]], i32 0
+; SKX-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1
+; SKX-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> undef, i32 [[TMP8]], i32 0
+; SKX-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1
+; SKX-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]]
+; SKX-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1
+; SKX-NEXT: [[TMP21:%.*]] = icmp sgt i32 [[TMP20]], undef
+; SKX-NEXT: [[TMP22:%.*]] = select i1 [[TMP21]], i32 [[TMP20]], i32 undef
+; SKX-NEXT: [[TMP23:%.*]] = icmp sgt i32 [[TMP22]], undef
+; SKX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32 [[TMP22]], i32 undef
+; SKX-NEXT: [[TMP25:%.*]] = icmp sgt i32 [[TMP24]], undef
+; SKX-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], i32 [[TMP24]], i32 undef
+; SKX-NEXT: [[TMP27:%.*]] = icmp sgt i32 [[TMP26]], undef
+; SKX-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 [[TMP26]], i32 undef
+; SKX-NEXT: [[TMP29:%.*]] = icmp sgt i32 [[TMP28]], [[TMP7]]
+; SKX-NEXT: [[TMP30:%.*]] = select i1 [[TMP29]], i32 [[TMP28]], i32 [[TMP7]]
+; SKX-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP30]], [[TMP8]]
+; SKX-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0
+; SKX-NEXT: [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], [[TMP20]]
+; SKX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP33]], i32 [[TMP32]], i32 [[TMP20]]
+; SKX-NEXT: [[TMP34:%.*]] = select i1 [[TMP31]], i32 [[TMP30]], i32 [[TMP8]]
+; SKX-NEXT: ret i32 [[OP_EXTRA]]
+;
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4
+ %4 = icmp sgt i32 %2, %3
+ br label %pp
+
+pp:
+ %5 = select i1 %4, i32 %2, i32 %3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2), align 8
+ %7 = icmp sgt i32 %5, %6
+ %8 = select i1 %7, i32 %5, i32 %6
+ %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 3), align 4
+ %10 = icmp sgt i32 %8, %9
+ %11 = select i1 %10, i32 %8, i32 %9
+ %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16
+ %13 = icmp sgt i32 %11, %12
+ %14 = select i1 %13, i32 %11, i32 %12
+ %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4
+ %16 = icmp sgt i32 %14, %15
+ %17 = select i1 %16, i32 %14, i32 %15
+ %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8
+ %19 = icmp sgt i32 %17, %18
+ %20 = select i1 %19, i32 %17, i32 %18
+ %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4
+ %22 = icmp sgt i32 %20, %21
+ %23 = select i1 %22, i32 %20, i32 %21
+ ret i32 %23
+}
+
+; PR38191 - We don't handle array-of-pointer reductions.
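+; A rough C equivalent of the pattern (a sketch; the identifiers are
+; illustrative, not taken from the original source):
+;
+;   int *maxp8() {
+;     int *m = arrp[0];
+;     for (int i = 1; i < 8; ++i)      // unsigned pointer compare (icmp ugt)
+;       m = arrp[i] > m ? arrp[i] : m;
+;     return m;
+;   }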
+define i32* @maxp8(i32) {
+; SSE-LABEL: @maxp8(
+; SSE-NEXT: [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16
+; SSE-NEXT: [[TMP3:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4
+; SSE-NEXT: [[TMP4:%.*]] = icmp ugt i32* [[TMP2]], [[TMP3]]
+; SSE-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32* [[TMP2]], i32* [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 2), align 8
+; SSE-NEXT: [[TMP7:%.*]] = icmp ugt i32* [[TMP5]], [[TMP6]]
+; SSE-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32* [[TMP5]], i32* [[TMP6]]
+; SSE-NEXT: [[TMP9:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 3), align 4
+; SSE-NEXT: [[TMP10:%.*]] = icmp ugt i32* [[TMP8]], [[TMP9]]
+; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32* [[TMP8]], i32* [[TMP9]]
+; SSE-NEXT: [[TMP12:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 4), align 16
+; SSE-NEXT: [[TMP13:%.*]] = icmp ugt i32* [[TMP11]], [[TMP12]]
+; SSE-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32* [[TMP11]], i32* [[TMP12]]
+; SSE-NEXT: [[TMP15:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 5), align 4
+; SSE-NEXT: [[TMP16:%.*]] = icmp ugt i32* [[TMP14]], [[TMP15]]
+; SSE-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32* [[TMP14]], i32* [[TMP15]]
+; SSE-NEXT: [[TMP18:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 6), align 8
+; SSE-NEXT: [[TMP19:%.*]] = icmp ugt i32* [[TMP17]], [[TMP18]]
+; SSE-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32* [[TMP17]], i32* [[TMP18]]
+; SSE-NEXT: [[TMP21:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 7), align 4
+; SSE-NEXT: [[TMP22:%.*]] = icmp ugt i32* [[TMP20]], [[TMP21]]
+; SSE-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32* [[TMP20]], i32* [[TMP21]]
+; SSE-NEXT: ret i32* [[TMP23]]
+;
+; AVX-LABEL: @maxp8(
+; AVX-NEXT: [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16
+; AVX-NEXT: [[TMP3:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4
+; AVX-NEXT: [[TMP4:%.*]] = icmp ugt i32* [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32* [[TMP2]], i32* [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 2), align 8
+; AVX-NEXT: [[TMP7:%.*]] = icmp ugt i32* [[TMP5]], [[TMP6]]
+; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32* [[TMP5]], i32* [[TMP6]]
+; AVX-NEXT: [[TMP9:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 3), align 4
+; AVX-NEXT: [[TMP10:%.*]] = icmp ugt i32* [[TMP8]], [[TMP9]]
+; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32* [[TMP8]], i32* [[TMP9]]
+; AVX-NEXT: [[TMP12:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 4), align 16
+; AVX-NEXT: [[TMP13:%.*]] = icmp ugt i32* [[TMP11]], [[TMP12]]
+; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32* [[TMP11]], i32* [[TMP12]]
+; AVX-NEXT: [[TMP15:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 5), align 4
+; AVX-NEXT: [[TMP16:%.*]] = icmp ugt i32* [[TMP14]], [[TMP15]]
+; AVX-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32* [[TMP14]], i32* [[TMP15]]
+; AVX-NEXT: [[TMP18:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 6), align 8
+; AVX-NEXT: [[TMP19:%.*]] = icmp ugt i32* [[TMP17]], [[TMP18]]
+; AVX-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32* [[TMP17]], i32* [[TMP18]]
+; AVX-NEXT: [[TMP21:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 7), align 4
+; AVX-NEXT: [[TMP22:%.*]] = icmp ugt i32* [[TMP20]], [[TMP21]]
+; AVX-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32* [[TMP20]], i32* [[TMP21]]
+; AVX-NEXT: ret i32* [[TMP23]]
+;
+; AVX2-LABEL: @maxp8(
+; AVX2-NEXT: [[TMP2:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16
+; AVX2-NEXT: [[TMP3:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4
+; AVX2-NEXT: [[TMP4:%.*]] = icmp ugt i32* [[TMP2]], [[TMP3]]
+; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32* [[TMP2]], i32* [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 2), align 8
+; AVX2-NEXT: [[TMP7:%.*]] = icmp ugt i32* [[TMP5]], [[TMP6]]
+; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32* [[TMP5]], i32* [[TMP6]]
+; AVX2-NEXT: [[TMP9:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 3), align 4
+; AVX2-NEXT: [[TMP10:%.*]] = icmp ugt i32* [[TMP8]], [[TMP9]]
+; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32* [[TMP8]], i32* [[TMP9]]
+; AVX2-NEXT: [[TMP12:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 4), align 16
+; AVX2-NEXT: [[TMP13:%.*]] = icmp ugt i32* [[TMP11]], [[TMP12]]
+; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32* [[TMP11]], i32* [[TMP12]]
+; AVX2-NEXT: [[TMP15:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 5), align 4
+; AVX2-NEXT: [[TMP16:%.*]] = icmp ugt i32* [[TMP14]], [[TMP15]]
+; AVX2-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32* [[TMP14]], i32* [[TMP15]]
+; AVX2-NEXT: [[TMP18:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 6), align 8
+; AVX2-NEXT: [[TMP19:%.*]] = icmp ugt i32* [[TMP17]], [[TMP18]]
+; AVX2-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32* [[TMP17]], i32* [[TMP18]]
+; AVX2-NEXT: [[TMP21:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 7), align 4
+; AVX2-NEXT: [[TMP22:%.*]] = icmp ugt i32* [[TMP20]], [[TMP21]]
+; AVX2-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i32* [[TMP20]], i32* [[TMP21]]
+; AVX2-NEXT: ret i32* [[TMP23]]
+;
+; SKX-LABEL: @maxp8(
+; SKX-NEXT: [[TMP2:%.*]] = load <2 x i32*>, <2 x i32*>* bitcast ([32 x i32*]* @arrp to <2 x i32*>*), align 16
+; SKX-NEXT: [[TMP3:%.*]] = extractelement <2 x i32*> [[TMP2]], i32 0
+; SKX-NEXT: [[TMP4:%.*]] = extractelement <2 x i32*> [[TMP2]], i32 1
+; SKX-NEXT: [[TMP5:%.*]] = icmp ugt i32* [[TMP3]], [[TMP4]]
+; SKX-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32* [[TMP3]], i32* [[TMP4]]
+; SKX-NEXT: [[TMP7:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 2), align 8
+; SKX-NEXT: [[TMP8:%.*]] = icmp ugt i32* [[TMP6]], [[TMP7]]
+; SKX-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32* [[TMP6]], i32* [[TMP7]]
+; SKX-NEXT: [[TMP10:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 3), align 4
+; SKX-NEXT: [[TMP11:%.*]] = icmp ugt i32* [[TMP9]], [[TMP10]]
+; SKX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32* [[TMP9]], i32* [[TMP10]]
+; SKX-NEXT: [[TMP13:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 4), align 16
+; SKX-NEXT: [[TMP14:%.*]] = icmp ugt i32* [[TMP12]], [[TMP13]]
+; SKX-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32* [[TMP12]], i32* [[TMP13]]
+; SKX-NEXT: [[TMP16:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 5), align 4
+; SKX-NEXT: [[TMP17:%.*]] = icmp ugt i32* [[TMP15]], [[TMP16]]
+; SKX-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32* [[TMP15]], i32* [[TMP16]]
+; SKX-NEXT: [[TMP19:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 6), align 8
+; SKX-NEXT: [[TMP20:%.*]] = icmp ugt i32* [[TMP18]], [[TMP19]]
+; SKX-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32* [[TMP18]], i32* [[TMP19]]
+; SKX-NEXT: [[TMP22:%.*]] = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 7), align 4
+; SKX-NEXT: [[TMP23:%.*]] = icmp ugt i32* [[TMP21]], [[TMP22]]
+; SKX-NEXT: [[TMP24:%.*]] = select i1 [[TMP23]], i32* [[TMP21]], i32* [[TMP22]]
+; SKX-NEXT: ret i32* [[TMP24]]
+;
+ %2 = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 0), align 16
+ %3 = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 1), align 4
+ %4 = icmp ugt i32* %2, %3
+ %5 = select i1 %4, i32* %2, i32* %3
+ %6 = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 2), align 8
+ %7 = icmp ugt i32* %5, %6
+ %8 = select i1 %7, i32* %5, i32* %6
+ %9 = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 3), align 4
+ %10 = icmp ugt i32* %8, %9
+ %11 = select i1 %10, i32* %8, i32* %9
+ %12 = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 4), align 16
+ %13 = icmp ugt i32* %11, %12
+ %14 = select i1 %13, i32* %11, i32* %12
+ %15 = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 5), align 4
+ %16 = icmp ugt i32* %14, %15
+ %17 = select i1 %16, i32* %14, i32* %15
+ %18 = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 6), align 8
+ %19 = icmp ugt i32* %17, %18
+ %20 = select i1 %19, i32* %17, i32* %18
+ %21 = load i32*, i32** getelementptr inbounds ([32 x i32*], [32 x i32*]* @arrp, i64 0, i64 7), align 4
+ %22 = icmp ugt i32* %20, %21
+ %23 = select i1 %22, i32* %20, i32* %21
+ ret i32* %23
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
new file mode 100644
index 00000000000..311d8a476c7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -0,0 +1,1887 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx | FileCheck %s --check-prefix=STORE
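+; The STORE prefix checks the second run, which additionally enables
+; horizontal reductions rooted at stores (-slp-vectorize-hor
+; -slp-vectorize-hor-store).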
+
+; #include <stdint.h>
+;
+; int foo(float *A, int n) {
+; float sum = 0;
+; for (intptr_t i=0; i < n; ++i) {
+; sum += 7*A[i*4 ] +
+; 7*A[i*4+1] +
+; 7*A[i*4+2] +
+; 7*A[i*4+3];
+; }
+; return sum;
+; }
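+;
+; The checks below match a log2 shuffle-reduction tree over the <4 x float>
+; products; each step folds the upper lanes into the lower ones (a sketch in
+; lane notation, not taken from the original test):
+;
+;   v = {a0,a1,a2,a3}             // after the fmul by 7.0
+;   v = v + shuffle(v, 2,3,_,_)   // lane0 = a0+a2, lane1 = a1+a3
+;   v = v + shuffle(v, 1,_,_,_)   // lane0 = a0+a1+a2+a3
+;   sum += v[0]                   // extractelement lane 0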
+
+define i32 @add_red(float* %A, i32 %n) {
+; CHECK-LABEL: @add_red(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; CHECK-NEXT: [[ADD28:%.*]] = or i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD28]]
+; CHECK-NEXT: [[ADD829:%.*]] = or i64 [[MUL]], 2
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD829]]
+; CHECK-NEXT: [[ADD1330:%.*]] = or i64 [[MUL]], 3
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
+; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[ADD11:%.*]] = fadd fast float [[ADD6]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[ADD16:%.*]] = fadd fast float [[ADD11]], undef
+; CHECK-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
+; CHECK-NEXT: [[INC]] = add nsw i64 [[I_033]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+; STORE-LABEL: @add_red(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; STORE-NEXT: br i1 [[CMP31]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; STORE: for.body.lr.ph:
+; STORE-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.body:
+; STORE-NEXT: [[I_033:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[SUM_032:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD17:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_033]], 2
+; STORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; STORE-NEXT: [[ADD28:%.*]] = or i64 [[MUL]], 1
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD28]]
+; STORE-NEXT: [[ADD829:%.*]] = or i64 [[MUL]], 2
+; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD829]]
+; STORE-NEXT: [[ADD1330:%.*]] = or i64 [[MUL]], 3
+; STORE-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
+; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
+; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; STORE-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
+; STORE-NEXT: [[ADD6:%.*]] = fadd fast float undef, undef
+; STORE-NEXT: [[ADD11:%.*]] = fadd fast float [[ADD6]], undef
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; STORE-NEXT: [[ADD16:%.*]] = fadd fast float [[ADD11]], undef
+; STORE-NEXT: [[ADD17]] = fadd fast float [[SUM_032]], [[TMP4]]
+; STORE-NEXT: [[INC]] = add nsw i64 [[I_033]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; STORE: for.cond.for.end_crit_edge:
+; STORE-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD17]] to i32
+; STORE-NEXT: br label [[FOR_END]]
+; STORE: for.end:
+; STORE-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; STORE-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+entry:
+ %cmp31 = icmp sgt i32 %n, 0
+ br i1 %cmp31, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %0 = sext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %i.033 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %sum.032 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add17, %for.body ]
+ %mul = shl nsw i64 %i.033, 2
+ %arrayidx = getelementptr inbounds float, float* %A, i64 %mul
+ %1 = load float, float* %arrayidx, align 4
+ %mul2 = fmul float %1, 7.000000e+00
+ %add28 = or i64 %mul, 1
+ %arrayidx4 = getelementptr inbounds float, float* %A, i64 %add28
+ %2 = load float, float* %arrayidx4, align 4
+ %mul5 = fmul float %2, 7.000000e+00
+ %add6 = fadd fast float %mul2, %mul5
+ %add829 = or i64 %mul, 2
+ %arrayidx9 = getelementptr inbounds float, float* %A, i64 %add829
+ %3 = load float, float* %arrayidx9, align 4
+ %mul10 = fmul float %3, 7.000000e+00
+ %add11 = fadd fast float %add6, %mul10
+ %add1330 = or i64 %mul, 3
+ %arrayidx14 = getelementptr inbounds float, float* %A, i64 %add1330
+ %4 = load float, float* %arrayidx14, align 4
+ %mul15 = fmul float %4, 7.000000e+00
+ %add16 = fadd fast float %add11, %mul15
+ %add17 = fadd fast float %sum.032, %add16
+ %inc = add nsw i64 %i.033, 1
+ %exitcond = icmp eq i64 %inc, %0
+ br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+ %phitmp = fptosi float %add17 to i32
+ br label %for.end
+
+for.end:
+ %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+; float sum = 0;
+; for (intptr_t i=0; i < n; ++i) {
+; sum *= B[0]*A[i*4 ] +
+; B[1]*A[i*4+1] +
+; B[2]*A[i*4+2] +
+; B[3]*A[i*4+3];
+; }
+; return sum;
+; }
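+;
+; Note the loop-carried operation here is a multiply (sum *= ...): only the
+; in-iteration sum of products becomes a vector fadd reduction, and its
+; scalar result then feeds the fmul that updates the accumulator.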
+
+define i32 @mul_red(float* noalias %A, float* noalias %B, i32 %n) {
+; CHECK-LABEL: @mul_red(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; CHECK-NEXT: [[ADD35:%.*]] = or i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
+; CHECK-NEXT: [[ADD1136:%.*]] = or i64 [[MUL]], 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1136]]
+; CHECK-NEXT: [[ADD1737:%.*]] = or i64 [[MUL]], 3
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1737]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[ADD14:%.*]] = fadd fast float [[ADD8]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP5]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[ADD20:%.*]] = fadd fast float [[ADD14]], undef
+; CHECK-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
+; CHECK-NEXT: [[INC]] = add nsw i64 [[I_040]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+; STORE-LABEL: @mul_red(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP38:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; STORE-NEXT: br i1 [[CMP38]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; STORE: for.body.lr.ph:
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; STORE-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; STORE-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
+; STORE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; STORE-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.body:
+; STORE-NEXT: [[I_040:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[SUM_039:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[MUL21:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_040]], 2
+; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; STORE-NEXT: [[ADD35:%.*]] = or i64 [[MUL]], 1
+; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
+; STORE-NEXT: [[ADD1136:%.*]] = or i64 [[MUL]], 2
+; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1136]]
+; STORE-NEXT: [[ADD1737:%.*]] = or i64 [[MUL]], 3
+; STORE-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1737]]
+; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
+; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; STORE-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[TMP1]], [[TMP4]]
+; STORE-NEXT: [[ADD8:%.*]] = fadd fast float undef, undef
+; STORE-NEXT: [[ADD14:%.*]] = fadd fast float [[ADD8]], undef
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP5]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; STORE-NEXT: [[ADD20:%.*]] = fadd fast float [[ADD14]], undef
+; STORE-NEXT: [[MUL21]] = fmul float [[SUM_039]], [[TMP6]]
+; STORE-NEXT: [[INC]] = add nsw i64 [[I_040]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; STORE: for.cond.for.end_crit_edge:
+; STORE-NEXT: [[PHITMP:%.*]] = fptosi float [[MUL21]] to i32
+; STORE-NEXT: br label [[FOR_END]]
+; STORE: for.end:
+; STORE-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; STORE-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+entry:
+ %cmp38 = icmp sgt i32 %n, 0
+ br i1 %cmp38, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %0 = load float, float* %B, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
+ %1 = load float, float* %arrayidx4, align 4
+ %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
+ %2 = load float, float* %arrayidx9, align 4
+ %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
+ %3 = load float, float* %arrayidx15, align 4
+ %4 = sext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %i.040 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %sum.039 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %mul21, %for.body ]
+ %mul = shl nsw i64 %i.040, 2
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
+ %5 = load float, float* %arrayidx2, align 4
+ %mul3 = fmul float %0, %5
+ %add35 = or i64 %mul, 1
+ %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add35
+ %6 = load float, float* %arrayidx6, align 4
+ %mul7 = fmul float %1, %6
+ %add8 = fadd fast float %mul3, %mul7
+ %add1136 = or i64 %mul, 2
+ %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1136
+ %7 = load float, float* %arrayidx12, align 4
+ %mul13 = fmul float %2, %7
+ %add14 = fadd fast float %add8, %mul13
+ %add1737 = or i64 %mul, 3
+ %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1737
+ %8 = load float, float* %arrayidx18, align 4
+ %mul19 = fmul float %3, %8
+ %add20 = fadd fast float %add14, %mul19
+ %mul21 = fmul float %sum.039, %add20
+ %inc = add nsw i64 %i.040, 1
+ %exitcond = icmp eq i64 %inc, %4
+ br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+ %phitmp = fptosi float %mul21 to i32
+ br label %for.end
+
+for.end:
+ %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+; float sum = 0;
+; for (intptr_t i=0; i < n; ++i) {
+; sum += B[0]*A[i*6 ] +
+; B[1]*A[i*6+1] +
+; B[2]*A[i*6+2] +
+; B[3]*A[i*6+3] +
+; B[4]*A[i*6+4] +
+; B[5]*A[i*6+5] +
+; B[6]*A[i*6+6] +
+; B[7]*A[i*6+7] +
+; B[8]*A[i*6+8];
+; }
+; return sum;
+; }
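+;
+; Only the first eight products fit the <8 x float> reduction; the ninth
+; term (B[8]*A[i*6+8]) is computed as a scalar fmul and folded into the
+; final sum separately.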
+
+define i32 @long_red(float* noalias %A, float* noalias %B, i32 %n) {
+; CHECK-LABEL: @long_red(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, float* [[B]], i64 4
+; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, float* [[B]], i64 5
+; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, float* [[B]], i64 6
+; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, float* [[B]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <8 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; CHECK-NEXT: [[ADD80:%.*]] = or i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD80]]
+; CHECK-NEXT: [[ADD11:%.*]] = add nsw i64 [[MUL]], 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD11]]
+; CHECK-NEXT: [[ADD17:%.*]] = add nsw i64 [[MUL]], 3
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD17]]
+; CHECK-NEXT: [[ADD23:%.*]] = add nsw i64 [[MUL]], 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD23]]
+; CHECK-NEXT: [[ADD29:%.*]] = add nsw i64 [[MUL]], 5
+; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD29]]
+; CHECK-NEXT: [[ADD35:%.*]] = add nsw i64 [[MUL]], 6
+; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
+; CHECK-NEXT: [[ADD41:%.*]] = add nsw i64 [[MUL]], 7
+; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD41]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[ADD14:%.*]] = fadd fast float [[ADD8]], undef
+; CHECK-NEXT: [[ADD20:%.*]] = fadd fast float [[ADD14]], undef
+; CHECK-NEXT: [[ADD26:%.*]] = fadd fast float [[ADD20]], undef
+; CHECK-NEXT: [[ADD32:%.*]] = fadd fast float [[ADD26]], undef
+; CHECK-NEXT: [[ADD38:%.*]] = fadd fast float [[ADD32]], undef
+; CHECK-NEXT: [[ADD44:%.*]] = fadd fast float [[ADD38]], undef
+; CHECK-NEXT: [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
+; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
+; CHECK-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP6]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
+; CHECK-NEXT: [[ADD50:%.*]] = fadd fast float [[ADD44]], [[MUL49]]
+; CHECK-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]]
+; CHECK-NEXT: [[INC]] = add nsw i64 [[I_083]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+; STORE-LABEL: @long_red(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP81:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; STORE-NEXT: br i1 [[CMP81]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; STORE: for.body.lr.ph:
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; STORE-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; STORE-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds float, float* [[B]], i64 4
+; STORE-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds float, float* [[B]], i64 5
+; STORE-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds float, float* [[B]], i64 6
+; STORE-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds float, float* [[B]], i64 7
+; STORE-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <8 x float>*
+; STORE-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; STORE-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds float, float* [[B]], i64 8
+; STORE-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX45]], align 4
+; STORE-NEXT: [[TMP3:%.*]] = sext i32 [[N]] to i64
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.body:
+; STORE-NEXT: [[I_083:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[SUM_082:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD51:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[MUL:%.*]] = mul nsw i64 [[I_083]], 6
+; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; STORE-NEXT: [[ADD80:%.*]] = or i64 [[MUL]], 1
+; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD80]]
+; STORE-NEXT: [[ADD11:%.*]] = add nsw i64 [[MUL]], 2
+; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD11]]
+; STORE-NEXT: [[ADD17:%.*]] = add nsw i64 [[MUL]], 3
+; STORE-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD17]]
+; STORE-NEXT: [[ADD23:%.*]] = add nsw i64 [[MUL]], 4
+; STORE-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD23]]
+; STORE-NEXT: [[ADD29:%.*]] = add nsw i64 [[MUL]], 5
+; STORE-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD29]]
+; STORE-NEXT: [[ADD35:%.*]] = add nsw i64 [[MUL]], 6
+; STORE-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD35]]
+; STORE-NEXT: [[ADD41:%.*]] = add nsw i64 [[MUL]], 7
+; STORE-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD41]]
+; STORE-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX2]] to <8 x float>*
+; STORE-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; STORE-NEXT: [[TMP6:%.*]] = fmul fast <8 x float> [[TMP1]], [[TMP5]]
+; STORE-NEXT: [[ADD8:%.*]] = fadd fast float undef, undef
+; STORE-NEXT: [[ADD14:%.*]] = fadd fast float [[ADD8]], undef
+; STORE-NEXT: [[ADD20:%.*]] = fadd fast float [[ADD14]], undef
+; STORE-NEXT: [[ADD26:%.*]] = fadd fast float [[ADD20]], undef
+; STORE-NEXT: [[ADD32:%.*]] = fadd fast float [[ADD26]], undef
+; STORE-NEXT: [[ADD38:%.*]] = fadd fast float [[ADD32]], undef
+; STORE-NEXT: [[ADD44:%.*]] = fadd fast float [[ADD38]], undef
+; STORE-NEXT: [[ADD47:%.*]] = add nsw i64 [[MUL]], 8
+; STORE-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD47]]
+; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX48]], align 4
+; STORE-NEXT: [[MUL49:%.*]] = fmul fast float [[TMP2]], [[TMP7]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP6]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; STORE-NEXT: [[TMP9:%.*]] = fadd fast float [[TMP8]], [[MUL49]]
+; STORE-NEXT: [[ADD50:%.*]] = fadd fast float [[ADD44]], [[MUL49]]
+; STORE-NEXT: [[ADD51]] = fadd fast float [[SUM_082]], [[TMP9]]
+; STORE-NEXT: [[INC]] = add nsw i64 [[I_083]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP3]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; STORE: for.cond.for.end_crit_edge:
+; STORE-NEXT: [[PHITMP:%.*]] = fptosi float [[ADD51]] to i32
+; STORE-NEXT: br label [[FOR_END]]
+; STORE: for.end:
+; STORE-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; STORE-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+entry:
+ %cmp81 = icmp sgt i32 %n, 0
+ br i1 %cmp81, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %0 = load float, float* %B, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
+ %1 = load float, float* %arrayidx4, align 4
+ %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
+ %2 = load float, float* %arrayidx9, align 4
+ %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
+ %3 = load float, float* %arrayidx15, align 4
+ %arrayidx21 = getelementptr inbounds float, float* %B, i64 4
+ %4 = load float, float* %arrayidx21, align 4
+ %arrayidx27 = getelementptr inbounds float, float* %B, i64 5
+ %5 = load float, float* %arrayidx27, align 4
+ %arrayidx33 = getelementptr inbounds float, float* %B, i64 6
+ %6 = load float, float* %arrayidx33, align 4
+ %arrayidx39 = getelementptr inbounds float, float* %B, i64 7
+ %7 = load float, float* %arrayidx39, align 4
+ %arrayidx45 = getelementptr inbounds float, float* %B, i64 8
+ %8 = load float, float* %arrayidx45, align 4
+ %9 = sext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %i.083 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %sum.082 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add51, %for.body ]
+ %mul = mul nsw i64 %i.083, 6
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
+ %10 = load float, float* %arrayidx2, align 4
+ %mul3 = fmul fast float %0, %10
+ %add80 = or i64 %mul, 1
+ %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add80
+ %11 = load float, float* %arrayidx6, align 4
+ %mul7 = fmul fast float %1, %11
+ %add8 = fadd fast float %mul3, %mul7
+ %add11 = add nsw i64 %mul, 2
+ %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add11
+ %12 = load float, float* %arrayidx12, align 4
+ %mul13 = fmul fast float %2, %12
+ %add14 = fadd fast float %add8, %mul13
+ %add17 = add nsw i64 %mul, 3
+ %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add17
+ %13 = load float, float* %arrayidx18, align 4
+ %mul19 = fmul fast float %3, %13
+ %add20 = fadd fast float %add14, %mul19
+ %add23 = add nsw i64 %mul, 4
+ %arrayidx24 = getelementptr inbounds float, float* %A, i64 %add23
+ %14 = load float, float* %arrayidx24, align 4
+ %mul25 = fmul fast float %4, %14
+ %add26 = fadd fast float %add20, %mul25
+ %add29 = add nsw i64 %mul, 5
+ %arrayidx30 = getelementptr inbounds float, float* %A, i64 %add29
+ %15 = load float, float* %arrayidx30, align 4
+ %mul31 = fmul fast float %5, %15
+ %add32 = fadd fast float %add26, %mul31
+ %add35 = add nsw i64 %mul, 6
+ %arrayidx36 = getelementptr inbounds float, float* %A, i64 %add35
+ %16 = load float, float* %arrayidx36, align 4
+ %mul37 = fmul fast float %6, %16
+ %add38 = fadd fast float %add32, %mul37
+ %add41 = add nsw i64 %mul, 7
+ %arrayidx42 = getelementptr inbounds float, float* %A, i64 %add41
+ %17 = load float, float* %arrayidx42, align 4
+ %mul43 = fmul fast float %7, %17
+ %add44 = fadd fast float %add38, %mul43
+ %add47 = add nsw i64 %mul, 8
+ %arrayidx48 = getelementptr inbounds float, float* %A, i64 %add47
+ %18 = load float, float* %arrayidx48, align 4
+ %mul49 = fmul fast float %8, %18
+ %add50 = fadd fast float %add44, %mul49
+ %add51 = fadd fast float %sum.082, %add50
+ %inc = add nsw i64 %i.083, 1
+ %exitcond = icmp eq i64 %inc, %9
+ br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+ %phitmp = fptosi float %add51 to i32
+ br label %for.end
+
+for.end:
+ %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ ret i32 %sum.0.lcssa
+}
+
+; int foo(float * restrict A, float * restrict B, int n) {
+; float sum = 0;
+; for (intptr_t i=0; i < n; ++i) {
+; sum += B[0]*A[i*4 ];
+; sum += B[1]*A[i*4+1];
+; sum += B[2]*A[i*4+2];
+; sum += B[3]*A[i*4+3];
+; }
+; return sum;
+; }
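+
+; A hedged C sketch (chain_red_ref is an invented name, not the tested
+; function) of the reassociation the CHECK lines below rely on: the
+; running `sum` threaded through each chained add is peeled out as a
+; single extra operand ([[OP_EXTRA]]) added once after the 4-lane
+; shuffle reduction.
+;
+; float chain_red_ref(const float *A, const float *B, ptrdiff_t n) {
+;   float sum = 0;
+;   for (ptrdiff_t i = 0; i < n; ++i) {
+;     float l[4];
+;     for (int k = 0; k < 4; ++k)       /* the <4 x float> fmul */
+;       l[k] = B[k] * A[i * 4 + k];
+;     float red = (l[0] + l[2]) + (l[1] + l[3]); /* two-step shuffle tree */
+;     sum = red + sum;                  /* old sum folded in last */
+;   }
+;   return sum;
+; }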
+
+define i32 @chain_red(float* noalias %A, float* noalias %B, i32 %n) {
+; CHECK-LABEL: @chain_red(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; CHECK-NEXT: [[ADD638:%.*]] = or i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD638]]
+; CHECK-NEXT: [[ADD1239:%.*]] = or i64 [[MUL]], 2
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1239]]
+; CHECK-NEXT: [[ADD1840:%.*]] = or i64 [[MUL]], 3
+; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1840]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[SUM_042]], undef
+; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[ADD]], undef
+; CHECK-NEXT: [[ADD15:%.*]] = fadd fast float [[ADD9]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP5]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]]
+; CHECK-NEXT: [[ADD21:%.*]] = fadd fast float [[ADD15]], undef
+; CHECK-NEXT: [[INC]] = add nsw i64 [[I_043]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: [[PHITMP:%.*]] = fptosi float [[OP_EXTRA]] to i32
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+; STORE-LABEL: @chain_red(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP41:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; STORE-NEXT: br i1 [[CMP41]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; STORE: for.body.lr.ph:
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; STORE-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; STORE-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; STORE-NEXT: [[TMP0:%.*]] = bitcast float* [[B]] to <4 x float>*
+; STORE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; STORE-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.body:
+; STORE-NEXT: [[I_043:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[SUM_042:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_043]], 2
+; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; STORE-NEXT: [[ADD638:%.*]] = or i64 [[MUL]], 1
+; STORE-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD638]]
+; STORE-NEXT: [[ADD1239:%.*]] = or i64 [[MUL]], 2
+; STORE-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1239]]
+; STORE-NEXT: [[ADD1840:%.*]] = or i64 [[MUL]], 3
+; STORE-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1840]]
+; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
+; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP4]]
+; STORE-NEXT: [[ADD:%.*]] = fadd fast float [[SUM_042]], undef
+; STORE-NEXT: [[ADD9:%.*]] = fadd fast float [[ADD]], undef
+; STORE-NEXT: [[ADD15:%.*]] = fadd fast float [[ADD9]], undef
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP5]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; STORE-NEXT: [[OP_EXTRA]] = fadd fast float [[TMP6]], [[SUM_042]]
+; STORE-NEXT: [[ADD21:%.*]] = fadd fast float [[ADD15]], undef
+; STORE-NEXT: [[INC]] = add nsw i64 [[I_043]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; STORE: for.cond.for.end_crit_edge:
+; STORE-NEXT: [[PHITMP:%.*]] = fptosi float [[OP_EXTRA]] to i32
+; STORE-NEXT: br label [[FOR_END]]
+; STORE: for.end:
+; STORE-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
+; STORE-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+entry:
+ %cmp41 = icmp sgt i32 %n, 0
+ br i1 %cmp41, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %0 = load float, float* %B, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
+ %1 = load float, float* %arrayidx4, align 4
+ %arrayidx10 = getelementptr inbounds float, float* %B, i64 2
+ %2 = load float, float* %arrayidx10, align 4
+ %arrayidx16 = getelementptr inbounds float, float* %B, i64 3
+ %3 = load float, float* %arrayidx16, align 4
+ %4 = sext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %i.043 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %sum.042 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add21, %for.body ]
+ %mul = shl nsw i64 %i.043, 2
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
+ %5 = load float, float* %arrayidx2, align 4
+ %mul3 = fmul fast float %0, %5
+ %add = fadd fast float %sum.042, %mul3
+ %add638 = or i64 %mul, 1
+ %arrayidx7 = getelementptr inbounds float, float* %A, i64 %add638
+ %6 = load float, float* %arrayidx7, align 4
+ %mul8 = fmul fast float %1, %6
+ %add9 = fadd fast float %add, %mul8
+ %add1239 = or i64 %mul, 2
+ %arrayidx13 = getelementptr inbounds float, float* %A, i64 %add1239
+ %7 = load float, float* %arrayidx13, align 4
+ %mul14 = fmul fast float %2, %7
+ %add15 = fadd fast float %add9, %mul14
+ %add1840 = or i64 %mul, 3
+ %arrayidx19 = getelementptr inbounds float, float* %A, i64 %add1840
+ %8 = load float, float* %arrayidx19, align 4
+ %mul20 = fmul fast float %3, %8
+ %add21 = fadd fast float %add15, %mul20
+ %inc = add nsw i64 %i.043, 1
+ %exitcond = icmp eq i64 %inc, %4
+ br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+ %phitmp = fptosi float %add21 to i32
+ br label %for.end
+
+for.end:
+ %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ ret i32 %sum.0.lcssa
+}
+
+; void foo(const float *arg_A, unsigned arg_B, float *array) {
+; for (uint32_t i = 0; i < 6; ++i) {
+; const float *ptr = arg_A + i;
+; float w0 = array[i * 4 + 0];
+; float w1 = array[i * 4 + 1];
+; float w2 = array[i * 4 + 2];
+; float w3 = array[i * 4 + 3];
+;
+; for (unsigned j = 0; j < arg_B; ++j) {
+; const float x1 = *ptr - (-1.1f * w0) - (1.2f * w1);
+; const float x2 = (2.1f * x1) + (-2.2f * w0) + (2.3f * w1);
+; const float x3 = x2 - (-3.1f * w2) - (3.2f * w3);
+; const float x4 = x3 + (-4.0f * w2) + w3;
+; w1 = w0;
+; w0 = x1;
+; w3 = w2;
+; w2 = x3;
+; }
+;
+; array[i * 4 + 0] = w0;
+; array[i * 4 + 1] = w1;
+; array[i * 4 + 2] = w2;
+; array[i * 4 + 3] = w3;
+; }
+; }
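+
+; A hedged sketch of one inner-loop step (step() is an invented helper;
+; x2 is folded into x3 and the dead x4 is dropped). Every result feeds
+; the next iteration through the rotation w1<-w0, w0<-x1, w3<-w2, w2<-x3,
+; so the body is expected to stay scalar; the CHECK lines only show the
+; subtractions reassociated into negated products (the *_NEG values).
+;
+; static void step(float in, float *w0, float *w1, float *w2, float *w3) {
+;   const float x1 = in - (-1.1f * *w0) - (1.2f * *w1);
+;   const float x3 = (2.1f * x1) + (-2.2f * *w0) + (2.3f * *w1)
+;                  - (-3.1f * *w2) - (3.2f * *w3);
+;   *w1 = *w0; *w0 = x1;
+;   *w3 = *w2; *w2 = x3;
+; }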
+
+define void @foo(float* nocapture readonly %arg_A, i32 %arg_B, float* nocapture %array) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; CHECK-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
+; CHECK: for.body16.lr.ph:
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
+; CHECK-NEXT: br label [[FOR_BODY16:%.*]]
+; CHECK: for.cond.cleanup15:
+; CHECK-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
+; CHECK-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: for.body16:
+; CHECK-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; CHECK-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; CHECK-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; CHECK-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
+; CHECK-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
+; CHECK-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; CHECK-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; CHECK-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; CHECK-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; CHECK-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; CHECK-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
+; CHECK-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
+;
+; STORE-LABEL: @foo(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP1495:%.*]] = icmp eq i32 [[ARG_B:%.*]], 0
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.cond.cleanup:
+; STORE-NEXT: ret void
+; STORE: for.body:
+; STORE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND_CLEANUP15:%.*]] ]
+; STORE-NEXT: [[TMP0:%.*]] = shl i64 [[INDVARS_IV]], 2
+; STORE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[ARRAY:%.*]], i64 [[TMP0]]
+; STORE-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; STORE-NEXT: [[TMP2:%.*]] = or i64 [[TMP0]], 1
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP2]]
+; STORE-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; STORE-NEXT: [[TMP4:%.*]] = or i64 [[TMP0]], 2
+; STORE-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP4]]
+; STORE-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX8]], align 4
+; STORE-NEXT: [[TMP6:%.*]] = or i64 [[TMP0]], 3
+; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[ARRAY]], i64 [[TMP6]]
+; STORE-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; STORE-NEXT: br i1 [[CMP1495]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16_LR_PH:%.*]]
+; STORE: for.body16.lr.ph:
+; STORE-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[ARG_A:%.*]], i64 [[INDVARS_IV]]
+; STORE-NEXT: [[TMP8:%.*]] = load float, float* [[ADD_PTR]], align 4
+; STORE-NEXT: br label [[FOR_BODY16:%.*]]
+; STORE: for.cond.cleanup15:
+; STORE-NEXT: [[W2_0_LCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ], [ [[SUB28:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W3_0_LCSSA:%.*]] = phi float [ [[TMP7]], [[FOR_BODY]] ], [ [[W2_096:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W1_0_LCSSA:%.*]] = phi float [ [[TMP3]], [[FOR_BODY]] ], [ [[W0_0100:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W0_0_LCSSA:%.*]] = phi float [ [[TMP1]], [[FOR_BODY]] ], [ [[SUB19:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: store float [[W0_0_LCSSA]], float* [[ARRAYIDX]], align 4
+; STORE-NEXT: store float [[W1_0_LCSSA]], float* [[ARRAYIDX4]], align 4
+; STORE-NEXT: store float [[W2_0_LCSSA]], float* [[ARRAYIDX8]], align 4
+; STORE-NEXT: store float [[W3_0_LCSSA]], float* [[ARRAYIDX12]], align 4
+; STORE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; STORE-NEXT: [[EXITCOND109:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 6
+; STORE-NEXT: br i1 [[EXITCOND109]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; STORE: for.body16:
+; STORE-NEXT: [[W0_0100]] = phi float [ [[TMP1]], [[FOR_BODY16_LR_PH]] ], [ [[SUB19]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W1_099:%.*]] = phi float [ [[TMP3]], [[FOR_BODY16_LR_PH]] ], [ [[W0_0100]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[J_098:%.*]] = phi i32 [ 0, [[FOR_BODY16_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W3_097:%.*]] = phi float [ [[TMP7]], [[FOR_BODY16_LR_PH]] ], [ [[W2_096]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[W2_096]] = phi float [ [[TMP5]], [[FOR_BODY16_LR_PH]] ], [ [[SUB28]], [[FOR_BODY16]] ]
+; STORE-NEXT: [[MUL17:%.*]] = fmul fast float [[W0_0100]], 0x3FF19999A0000000
+; STORE-NEXT: [[MUL18_NEG:%.*]] = fmul fast float [[W1_099]], 0xBFF3333340000000
+; STORE-NEXT: [[SUB92:%.*]] = fadd fast float [[MUL17]], [[MUL18_NEG]]
+; STORE-NEXT: [[SUB19]] = fadd fast float [[SUB92]], [[TMP8]]
+; STORE-NEXT: [[MUL20:%.*]] = fmul fast float [[SUB19]], 0x4000CCCCC0000000
+; STORE-NEXT: [[MUL21_NEG:%.*]] = fmul fast float [[W0_0100]], 0xC0019999A0000000
+; STORE-NEXT: [[MUL23:%.*]] = fmul fast float [[W1_099]], 0x4002666660000000
+; STORE-NEXT: [[MUL25:%.*]] = fmul fast float [[W2_096]], 0x4008CCCCC0000000
+; STORE-NEXT: [[MUL27_NEG:%.*]] = fmul fast float [[W3_097]], 0xC0099999A0000000
+; STORE-NEXT: [[ADD2293:%.*]] = fadd fast float [[MUL27_NEG]], [[MUL25]]
+; STORE-NEXT: [[ADD24:%.*]] = fadd fast float [[ADD2293]], [[MUL23]]
+; STORE-NEXT: [[SUB2694:%.*]] = fadd fast float [[ADD24]], [[MUL21_NEG]]
+; STORE-NEXT: [[SUB28]] = fadd fast float [[SUB2694]], [[MUL20]]
+; STORE-NEXT: [[INC]] = add nuw i32 [[J_098]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[ARG_B]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP15]], label [[FOR_BODY16]]
+;
+entry:
+ %cmp1495 = icmp eq i32 %arg_B, 0
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.cleanup15
+ ret void
+
+for.body: ; preds = %for.cond.cleanup15, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond.cleanup15 ]
+ %0 = shl i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds float, float* %array, i64 %0
+ %1 = load float, float* %arrayidx, align 4
+ %2 = or i64 %0, 1
+ %arrayidx4 = getelementptr inbounds float, float* %array, i64 %2
+ %3 = load float, float* %arrayidx4, align 4
+ %4 = or i64 %0, 2
+ %arrayidx8 = getelementptr inbounds float, float* %array, i64 %4
+ %5 = load float, float* %arrayidx8, align 4
+ %6 = or i64 %0, 3
+ %arrayidx12 = getelementptr inbounds float, float* %array, i64 %6
+ %7 = load float, float* %arrayidx12, align 4
+ br i1 %cmp1495, label %for.cond.cleanup15, label %for.body16.lr.ph
+
+for.body16.lr.ph: ; preds = %for.body
+ %add.ptr = getelementptr inbounds float, float* %arg_A, i64 %indvars.iv
+ %8 = load float, float* %add.ptr, align 4
+ br label %for.body16
+
+for.cond.cleanup15: ; preds = %for.body16, %for.body
+ %w2.0.lcssa = phi float [ %5, %for.body ], [ %sub28, %for.body16 ]
+ %w3.0.lcssa = phi float [ %7, %for.body ], [ %w2.096, %for.body16 ]
+ %w1.0.lcssa = phi float [ %3, %for.body ], [ %w0.0100, %for.body16 ]
+ %w0.0.lcssa = phi float [ %1, %for.body ], [ %sub19, %for.body16 ]
+ store float %w0.0.lcssa, float* %arrayidx, align 4
+ store float %w1.0.lcssa, float* %arrayidx4, align 4
+ store float %w2.0.lcssa, float* %arrayidx8, align 4
+ store float %w3.0.lcssa, float* %arrayidx12, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond109 = icmp eq i64 %indvars.iv.next, 6
+ br i1 %exitcond109, label %for.cond.cleanup, label %for.body
+
+for.body16: ; preds = %for.body16, %for.body16.lr.ph
+ %w0.0100 = phi float [ %1, %for.body16.lr.ph ], [ %sub19, %for.body16 ]
+ %w1.099 = phi float [ %3, %for.body16.lr.ph ], [ %w0.0100, %for.body16 ]
+ %j.098 = phi i32 [ 0, %for.body16.lr.ph ], [ %inc, %for.body16 ]
+ %w3.097 = phi float [ %7, %for.body16.lr.ph ], [ %w2.096, %for.body16 ]
+ %w2.096 = phi float [ %5, %for.body16.lr.ph ], [ %sub28, %for.body16 ]
+ %mul17 = fmul fast float %w0.0100, 0x3FF19999A0000000
+ %mul18.neg = fmul fast float %w1.099, 0xBFF3333340000000
+ %sub92 = fadd fast float %mul17, %mul18.neg
+ %sub19 = fadd fast float %sub92, %8
+ %mul20 = fmul fast float %sub19, 0x4000CCCCC0000000
+ %mul21.neg = fmul fast float %w0.0100, 0xC0019999A0000000
+ %mul23 = fmul fast float %w1.099, 0x4002666660000000
+ %mul25 = fmul fast float %w2.096, 0x4008CCCCC0000000
+ %mul27.neg = fmul fast float %w3.097, 0xC0099999A0000000
+ %add2293 = fadd fast float %mul27.neg, %mul25
+ %add24 = fadd fast float %add2293, %mul23
+ %sub2694 = fadd fast float %add24, %mul21.neg
+ %sub28 = fadd fast float %sub2694, %mul20
+ %inc = add nuw i32 %j.098, 1
+ %exitcond = icmp eq i32 %inc, %arg_B
+ br i1 %exitcond, label %for.cond.cleanup15, label %for.body16
+}
+
+; void foo(double * restrict A, double * restrict B, double * restrict C,
+; int n) {
+; for (intptr_t i=0; i < n; ++i) {
+; C[i] = B[0] *A[i*4 ] + B[1] *A[i*4+1];
+; }
+; }
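+
+; A hedged C sketch (store_red_double_ref is an invented name): a 2-lane
+; dot product per iteration. The STORE run below vectorizes it, and with
+; only two lanes the horizontal add is a pair of extractelements plus one
+; scalar fadd rather than a shuffle tree.
+;
+; void store_red_double_ref(const double *A, const double *B, double *C,
+;                           ptrdiff_t n) {
+;   for (ptrdiff_t i = 0; i < n; ++i) {
+;     double l0 = B[0] * A[i * 4 + 0]; /* lane 0 of the <2 x double> fmul */
+;     double l1 = B[1] * A[i * 4 + 1]; /* lane 1 */
+;     C[i] = l0 + l1;                  /* extract, extract, fadd */
+;   }
+; }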
+
+define void @store_red_double(double* noalias %A, double* noalias %B, double* noalias %C, i32 %n) {
+; CHECK-LABEL: @store_red_double(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[B:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
+; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[MUL3:%.*]] = fmul fast double [[TMP0]], [[TMP3]]
+; CHECK-NEXT: [[ADD16:%.*]] = or i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
+; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[MUL7:%.*]] = fmul fast double [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast double [[MUL3]], [[MUL7]]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
+; CHECK-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: [[INC]] = add nsw i64 [[I_018]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @store_red_double(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; STORE-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; STORE: for.body.lr.ph:
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; STORE-NEXT: [[TMP0:%.*]] = bitcast double* [[B]] to <2 x double>*
+; STORE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; STORE-NEXT: [[TMP2:%.*]] = sext i32 [[N]] to i64
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.body:
+; STORE-NEXT: [[I_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_018]], 2
+; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[MUL]]
+; STORE-NEXT: [[ADD16:%.*]] = or i64 [[MUL]], 1
+; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[ADD16]]
+; STORE-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>*
+; STORE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; STORE-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]]
+; STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+; STORE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
+; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
+; STORE-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
+; STORE-NEXT: [[INC]] = add nsw i64 [[I_018]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; STORE: for.end:
+; STORE-NEXT: ret void
+;
+entry:
+ %cmp17 = icmp sgt i32 %n, 0
+ br i1 %cmp17, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %0 = load double, double* %B, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %B, i64 1
+ %1 = load double, double* %arrayidx4, align 8
+ %2 = sext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %i.018 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %mul = shl nsw i64 %i.018, 2
+ %arrayidx2 = getelementptr inbounds double, double* %A, i64 %mul
+ %3 = load double, double* %arrayidx2, align 8
+ %mul3 = fmul fast double %0, %3
+ %add16 = or i64 %mul, 1
+ %arrayidx6 = getelementptr inbounds double, double* %A, i64 %add16
+ %4 = load double, double* %arrayidx6, align 8
+ %mul7 = fmul fast double %1, %4
+ %add8 = fadd fast double %mul3, %mul7
+ %arrayidx9 = getelementptr inbounds double, double* %C, i64 %i.018
+ store double %add8, double* %arrayidx9, align 8
+ %inc = add nsw i64 %i.018, 1
+ %exitcond = icmp eq i64 %inc, %2
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; int foo(float * restrict A, float * restrict B, float * restrict C, int n) {
+; float sum = 0;
+; for (intptr_t i=0; i < n; ++i) {
+; C[i] = B[0] *A[i*4 ] +
+; B[1] *A[i*4+1] +
+; B[2] *A[i*4+2] +
+; B[3] *A[i*4+3];
+; }
+; return sum;
+; }
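+
+; A hedged C sketch (store_red_ref is an invented name) of the STORE-mode
+; expectation: with the reduction rooted at a store, the four products
+; become one <4 x float> fmul plus a two-step pairwise shuffle reduction
+; whose lane 0 is stored; the plain CHECK run keeps the body scalar.
+;
+; void store_red_ref(const float *A, const float *B, float *C,
+;                    ptrdiff_t n) {
+;   for (ptrdiff_t i = 0; i < n; ++i) {
+;     float l[4];
+;     for (int k = 0; k < 4; ++k)
+;       l[k] = B[k] * A[i * 4 + k];
+;     C[i] = (l[0] + l[2]) + (l[1] + l[3]); /* shuffle tree, lane 0 stored */
+;   }
+; }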
+
+define i32 @store_red(float* noalias %A, float* noalias %B, float* noalias %C, i32 %n) {
+; CHECK-LABEL: @store_red(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body.lr.ph:
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[B]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ADD34:%.*]] = or i64 [[MUL]], 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[MUL3]], [[MUL7]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ADD1135:%.*]] = or i64 [[MUL]], 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[ADD14:%.*]] = fadd fast float [[ADD8]], [[MUL13]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX15]], align 4
+; CHECK-NEXT: [[ADD1736:%.*]] = or i64 [[MUL]], 3
+; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX18]], align 4
+; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[ADD20:%.*]] = fadd fast float [[ADD14]], [[MUL19]]
+; CHECK-NEXT: store float [[ADD20]], float* [[C_ADDR_038]], align 4
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
+; CHECK-NEXT: [[INC]] = add nsw i64 [[I_039]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 0
+;
+; STORE-LABEL: @store_red(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[CMP37:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; STORE-NEXT: br i1 [[CMP37]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
+; STORE: for.body.lr.ph:
+; STORE-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1
+; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; STORE-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; STORE-NEXT: [[TMP0:%.*]] = sext i32 [[N]] to i64
+; STORE-NEXT: br label [[FOR_BODY:%.*]]
+; STORE: for.body:
+; STORE-NEXT: [[I_039:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[C_ADDR_038:%.*]] = phi float* [ [[C:%.*]], [[FOR_BODY_LR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
+; STORE-NEXT: [[MUL:%.*]] = shl nsw i64 [[I_039]], 2
+; STORE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[MUL]]
+; STORE-NEXT: [[ADD34:%.*]] = or i64 [[MUL]], 1
+; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]]
+; STORE-NEXT: [[ADD1135:%.*]] = or i64 [[MUL]], 2
+; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]]
+; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[B]] to <4 x float>*
+; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; STORE-NEXT: [[ADD1736:%.*]] = or i64 [[MUL]], 3
+; STORE-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]]
+; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>*
+; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]]
+; STORE-NEXT: [[ADD8:%.*]] = fadd fast float undef, undef
+; STORE-NEXT: [[ADD14:%.*]] = fadd fast float [[ADD8]], undef
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP5]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; STORE-NEXT: [[ADD20:%.*]] = fadd fast float [[ADD14]], undef
+; STORE-NEXT: store float [[TMP6]], float* [[C_ADDR_038]], align 4
+; STORE-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[C_ADDR_038]], i64 1
+; STORE-NEXT: [[INC]] = add nsw i64 [[I_039]], 1
+; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
+; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; STORE: for.end:
+; STORE-NEXT: ret i32 0
+;
+entry:
+ %cmp37 = icmp sgt i32 %n, 0
+ br i1 %cmp37, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+ %arrayidx4 = getelementptr inbounds float, float* %B, i64 1
+ %arrayidx9 = getelementptr inbounds float, float* %B, i64 2
+ %arrayidx15 = getelementptr inbounds float, float* %B, i64 3
+ %0 = sext i32 %n to i64
+ br label %for.body
+
+for.body:
+ %i.039 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %C.addr.038 = phi float* [ %C, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+ %1 = load float, float* %B, align 4
+ %mul = shl nsw i64 %i.039, 2
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 %mul
+ %2 = load float, float* %arrayidx2, align 4
+ %mul3 = fmul fast float %1, %2
+ %3 = load float, float* %arrayidx4, align 4
+ %add34 = or i64 %mul, 1
+ %arrayidx6 = getelementptr inbounds float, float* %A, i64 %add34
+ %4 = load float, float* %arrayidx6, align 4
+ %mul7 = fmul fast float %3, %4
+ %add8 = fadd fast float %mul3, %mul7
+ %5 = load float, float* %arrayidx9, align 4
+ %add1135 = or i64 %mul, 2
+ %arrayidx12 = getelementptr inbounds float, float* %A, i64 %add1135
+ %6 = load float, float* %arrayidx12, align 4
+ %mul13 = fmul fast float %5, %6
+ %add14 = fadd fast float %add8, %mul13
+ %7 = load float, float* %arrayidx15, align 4
+ %add1736 = or i64 %mul, 3
+ %arrayidx18 = getelementptr inbounds float, float* %A, i64 %add1736
+ %8 = load float, float* %arrayidx18, align 4
+ %mul19 = fmul fast float %7, %8
+ %add20 = fadd fast float %add14, %mul19
+ store float %add20, float* %C.addr.038, align 4
+ %incdec.ptr = getelementptr inbounds float, float* %C.addr.038, i64 1
+ %inc = add nsw i64 %i.039, 1
+ %exitcond = icmp eq i64 %inc, %0
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret i32 0
+}
+
+@arr_i32 = global [32 x i32] zeroinitializer, align 16
+@arr_float = global [32 x float] zeroinitializer, align 16
+
+define void @float_red_example4(float* %res) {
+; CHECK-LABEL: @float_red_example4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: store float [[ADD_2]], float* [[RES:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @float_red_example4(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* bitcast ([32 x float]* @arr_float to <4 x float>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = fadd fast float undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; STORE-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+ %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+ %add = fadd fast float %1, %0
+ %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+ %add.1 = fadd fast float %2, %add
+ %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+ %add.2 = fadd fast float %3, %add.1
+ store float %add.2, float* %res, align 16
+ ret void
+}
+
+define void @float_red_example8(float* %res) {
+; CHECK-LABEL: @float_red_example8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: store float [[ADD_6]], float* [[RES:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @float_red_example8(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <8 x float>, <8 x float>* bitcast ([32 x float]* @arr_float to <8 x float>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = fadd fast float undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; STORE-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; STORE-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; STORE-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; STORE-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; STORE-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+ %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+ %add = fadd fast float %1, %0
+ %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+ %add.1 = fadd fast float %2, %add
+ %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+ %add.2 = fadd fast float %3, %add.1
+ %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
+ %add.3 = fadd fast float %4, %add.2
+ %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
+ %add.4 = fadd fast float %5, %add.3
+ %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
+ %add.5 = fadd fast float %6, %add.4
+ %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
+ %add.6 = fadd fast float %7, %add.5
+ store float %add.6, float* %res, align 16
+ ret void
+}
+
+define void @float_red_example16(float* %res) {
+; CHECK-LABEL: @float_red_example16(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
+; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float [[TMP8]], [[ADD_6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
+; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float [[TMP9]], [[ADD_7]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
+; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float [[TMP10]], [[ADD_8]]
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
+; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float [[TMP11]], [[ADD_9]]
+; CHECK-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
+; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float [[TMP12]], [[ADD_10]]
+; CHECK-NEXT: [[TMP13:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
+; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float [[TMP13]], [[ADD_11]]
+; CHECK-NEXT: [[TMP14:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
+; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float [[TMP14]], [[ADD_12]]
+; CHECK-NEXT: [[TMP15:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
+; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float [[TMP15]], [[ADD_13]]
+; CHECK-NEXT: store float [[ADD_14]], float* [[RES:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @float_red_example16(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <16 x float>, <16 x float>* bitcast ([32 x float]* @arr_float to <16 x float>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = fadd fast float undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; STORE-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; STORE-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; STORE-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; STORE-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; STORE-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; STORE-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; STORE-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; STORE-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; STORE-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; STORE-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; STORE-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; STORE-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
+; STORE-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; STORE-NEXT: store float [[TMP1]], float* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 0), align 16
+ %1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 1), align 4
+ %add = fadd fast float %1, %0
+ %2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 2), align 8
+ %add.1 = fadd fast float %2, %add
+ %3 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 3), align 4
+ %add.2 = fadd fast float %3, %add.1
+ %4 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 4), align 16
+ %add.3 = fadd fast float %4, %add.2
+ %5 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 5), align 4
+ %add.4 = fadd fast float %5, %add.3
+ %6 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 6), align 8
+ %add.5 = fadd fast float %6, %add.4
+ %7 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 7), align 4
+ %add.6 = fadd fast float %7, %add.5
+ %8 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 8), align 16
+ %add.7 = fadd fast float %8, %add.6
+ %9 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 9), align 4
+ %add.8 = fadd fast float %9, %add.7
+ %10 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 10), align 8
+ %add.9 = fadd fast float %10, %add.8
+ %11 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 11), align 4
+ %add.10 = fadd fast float %11, %add.9
+ %12 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 12), align 16
+ %add.11 = fadd fast float %12, %add.10
+ %13 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 13), align 4
+ %add.12 = fadd fast float %13, %add.11
+ %14 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 14), align 8
+ %add.13 = fadd fast float %14, %add.12
+ %15 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr_float, i64 0, i64 15), align 4
+ %add.14 = fadd fast float %15, %add.13
+ store float %add.14, float* %res, align 16
+ ret void
+}
+
+define void @i32_red_example4(i32* %res) {
+; CHECK-LABEL: @i32_red_example4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: store i32 [[ADD_2]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @i32_red_example4(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
+; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+ %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+ %add = add nsw i32 %1, %0
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+ %add.1 = add nsw i32 %2, %add
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+ %add.2 = add nsw i32 %3, %add.1
+ store i32 %add.2, i32* %res, align 16
+ ret void
+}
+
+define void @i32_red_example8(i32* %res) {
+; CHECK-LABEL: @i32_red_example8(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: store i32 [[ADD_6]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @i32_red_example8(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]]
+; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
+; STORE-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]]
+; STORE-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]]
+; STORE-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; STORE-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]]
+; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+ %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+ %add = add nsw i32 %1, %0
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+ %add.1 = add nsw i32 %2, %add
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+ %add.2 = add nsw i32 %3, %add.1
+ %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+ %add.3 = add nsw i32 %4, %add.2
+ %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+ %add.4 = add nsw i32 %5, %add.3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+ %add.5 = add nsw i32 %6, %add.4
+ %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+ %add.6 = add nsw i32 %7, %add.5
+ store i32 %add.6, i32* %res, align 16
+ ret void
+}
+
+define void @i32_red_example16(i32* %res) {
+; CHECK-LABEL: @i32_red_example16(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
+; CHECK-NEXT: [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
+; CHECK-NEXT: [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
+; CHECK-NEXT: [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
+; CHECK-NEXT: [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
+; CHECK-NEXT: [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
+; CHECK-NEXT: [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
+; CHECK-NEXT: [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
+; CHECK-NEXT: [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
+; CHECK-NEXT: store i32 [[ADD_14]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @i32_red_example16(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]]
+; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
+; STORE-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]]
+; STORE-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]]
+; STORE-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]]
+; STORE-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]]
+; STORE-NEXT: [[ADD_7:%.*]] = add nsw i32 undef, [[ADD_6]]
+; STORE-NEXT: [[ADD_8:%.*]] = add nsw i32 undef, [[ADD_7]]
+; STORE-NEXT: [[ADD_9:%.*]] = add nsw i32 undef, [[ADD_8]]
+; STORE-NEXT: [[ADD_10:%.*]] = add nsw i32 undef, [[ADD_9]]
+; STORE-NEXT: [[ADD_11:%.*]] = add nsw i32 undef, [[ADD_10]]
+; STORE-NEXT: [[ADD_12:%.*]] = add nsw i32 undef, [[ADD_11]]
+; STORE-NEXT: [[ADD_13:%.*]] = add nsw i32 undef, [[ADD_12]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <16 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX6:%.*]] = add nsw <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
+; STORE-NEXT: [[ADD_14:%.*]] = add nsw i32 undef, [[ADD_13]]
+; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+ %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+ %add = add nsw i32 %1, %0
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+ %add.1 = add nsw i32 %2, %add
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+ %add.2 = add nsw i32 %3, %add.1
+ %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+ %add.3 = add nsw i32 %4, %add.2
+ %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+ %add.4 = add nsw i32 %5, %add.3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+ %add.5 = add nsw i32 %6, %add.4
+ %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+ %add.6 = add nsw i32 %7, %add.5
+ %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
+ %add.7 = add nsw i32 %8, %add.6
+ %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
+ %add.8 = add nsw i32 %9, %add.7
+ %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
+ %add.9 = add nsw i32 %10, %add.8
+ %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
+ %add.10 = add nsw i32 %11, %add.9
+ %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
+ %add.11 = add nsw i32 %12, %add.10
+ %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
+ %add.12 = add nsw i32 %13, %add.11
+ %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
+ %add.13 = add nsw i32 %14, %add.12
+ %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
+ %add.14 = add nsw i32 %15, %add.13
+ store i32 %add.14, i32* %res, align 16
+ ret void
+}
+
+define void @i32_red_example32(i32* %res) {
+; CHECK-LABEL: @i32_red_example32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
+; CHECK-NEXT: [[ADD_7:%.*]] = add nsw i32 [[TMP8]], [[ADD_6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
+; CHECK-NEXT: [[ADD_8:%.*]] = add nsw i32 [[TMP9]], [[ADD_7]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
+; CHECK-NEXT: [[ADD_9:%.*]] = add nsw i32 [[TMP10]], [[ADD_8]]
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
+; CHECK-NEXT: [[ADD_10:%.*]] = add nsw i32 [[TMP11]], [[ADD_9]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
+; CHECK-NEXT: [[ADD_11:%.*]] = add nsw i32 [[TMP12]], [[ADD_10]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
+; CHECK-NEXT: [[ADD_12:%.*]] = add nsw i32 [[TMP13]], [[ADD_11]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
+; CHECK-NEXT: [[ADD_13:%.*]] = add nsw i32 [[TMP14]], [[ADD_12]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
+; CHECK-NEXT: [[ADD_14:%.*]] = add nsw i32 [[TMP15]], [[ADD_13]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
+; CHECK-NEXT: [[ADD_15:%.*]] = add nsw i32 [[TMP16]], [[ADD_14]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
+; CHECK-NEXT: [[ADD_16:%.*]] = add nsw i32 [[TMP17]], [[ADD_15]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
+; CHECK-NEXT: [[ADD_17:%.*]] = add nsw i32 [[TMP18]], [[ADD_16]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
+; CHECK-NEXT: [[ADD_18:%.*]] = add nsw i32 [[TMP19]], [[ADD_17]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
+; CHECK-NEXT: [[ADD_19:%.*]] = add nsw i32 [[TMP20]], [[ADD_18]]
+; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
+; CHECK-NEXT: [[ADD_20:%.*]] = add nsw i32 [[TMP21]], [[ADD_19]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
+; CHECK-NEXT: [[ADD_21:%.*]] = add nsw i32 [[TMP22]], [[ADD_20]]
+; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
+; CHECK-NEXT: [[ADD_22:%.*]] = add nsw i32 [[TMP23]], [[ADD_21]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
+; CHECK-NEXT: [[ADD_23:%.*]] = add nsw i32 [[TMP24]], [[ADD_22]]
+; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
+; CHECK-NEXT: [[ADD_24:%.*]] = add nsw i32 [[TMP25]], [[ADD_23]]
+; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
+; CHECK-NEXT: [[ADD_25:%.*]] = add nsw i32 [[TMP26]], [[ADD_24]]
+; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
+; CHECK-NEXT: [[ADD_26:%.*]] = add nsw i32 [[TMP27]], [[ADD_25]]
+; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
+; CHECK-NEXT: [[ADD_27:%.*]] = add nsw i32 [[TMP28]], [[ADD_26]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
+; CHECK-NEXT: [[ADD_28:%.*]] = add nsw i32 [[TMP29]], [[ADD_27]]
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
+; CHECK-NEXT: [[ADD_29:%.*]] = add nsw i32 [[TMP30]], [[ADD_28]]
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
+; CHECK-NEXT: [[ADD_30:%.*]] = add nsw i32 [[TMP31]], [[ADD_29]]
+; CHECK-NEXT: store i32 [[ADD_30]], i32* [[RES:%.*]], align 16
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @i32_red_example32(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]]
+; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
+; STORE-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]]
+; STORE-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]]
+; STORE-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]]
+; STORE-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]]
+; STORE-NEXT: [[ADD_7:%.*]] = add nsw i32 undef, [[ADD_6]]
+; STORE-NEXT: [[ADD_8:%.*]] = add nsw i32 undef, [[ADD_7]]
+; STORE-NEXT: [[ADD_9:%.*]] = add nsw i32 undef, [[ADD_8]]
+; STORE-NEXT: [[ADD_10:%.*]] = add nsw i32 undef, [[ADD_9]]
+; STORE-NEXT: [[ADD_11:%.*]] = add nsw i32 undef, [[ADD_10]]
+; STORE-NEXT: [[ADD_12:%.*]] = add nsw i32 undef, [[ADD_11]]
+; STORE-NEXT: [[ADD_13:%.*]] = add nsw i32 undef, [[ADD_12]]
+; STORE-NEXT: [[ADD_14:%.*]] = add nsw i32 undef, [[ADD_13]]
+; STORE-NEXT: [[ADD_15:%.*]] = add nsw i32 undef, [[ADD_14]]
+; STORE-NEXT: [[ADD_16:%.*]] = add nsw i32 undef, [[ADD_15]]
+; STORE-NEXT: [[ADD_17:%.*]] = add nsw i32 undef, [[ADD_16]]
+; STORE-NEXT: [[ADD_18:%.*]] = add nsw i32 undef, [[ADD_17]]
+; STORE-NEXT: [[ADD_19:%.*]] = add nsw i32 undef, [[ADD_18]]
+; STORE-NEXT: [[ADD_20:%.*]] = add nsw i32 undef, [[ADD_19]]
+; STORE-NEXT: [[ADD_21:%.*]] = add nsw i32 undef, [[ADD_20]]
+; STORE-NEXT: [[ADD_22:%.*]] = add nsw i32 undef, [[ADD_21]]
+; STORE-NEXT: [[ADD_23:%.*]] = add nsw i32 undef, [[ADD_22]]
+; STORE-NEXT: [[ADD_24:%.*]] = add nsw i32 undef, [[ADD_23]]
+; STORE-NEXT: [[ADD_25:%.*]] = add nsw i32 undef, [[ADD_24]]
+; STORE-NEXT: [[ADD_26:%.*]] = add nsw i32 undef, [[ADD_25]]
+; STORE-NEXT: [[ADD_27:%.*]] = add nsw i32 undef, [[ADD_26]]
+; STORE-NEXT: [[ADD_28:%.*]] = add nsw i32 undef, [[ADD_27]]
+; STORE-NEXT: [[ADD_29:%.*]] = add nsw i32 undef, [[ADD_28]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <32 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX6:%.*]] = add nsw <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX8:%.*]] = add nsw <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
+; STORE-NEXT: [[ADD_30:%.*]] = add nsw i32 undef, [[ADD_29]]
+; STORE-NEXT: store i32 [[TMP1]], i32* [[RES:%.*]], align 16
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+ %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+ %add = add nsw i32 %1, %0
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+ %add.1 = add nsw i32 %2, %add
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+ %add.2 = add nsw i32 %3, %add.1
+ %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+ %add.3 = add nsw i32 %4, %add.2
+ %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+ %add.4 = add nsw i32 %5, %add.3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+ %add.5 = add nsw i32 %6, %add.4
+ %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+ %add.6 = add nsw i32 %7, %add.5
+ %8 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 8), align 16
+ %add.7 = add nsw i32 %8, %add.6
+ %9 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 9), align 4
+ %add.8 = add nsw i32 %9, %add.7
+ %10 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 10), align 8
+ %add.9 = add nsw i32 %10, %add.8
+ %11 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 11), align 4
+ %add.10 = add nsw i32 %11, %add.9
+ %12 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 12), align 16
+ %add.11 = add nsw i32 %12, %add.10
+ %13 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 13), align 4
+ %add.12 = add nsw i32 %13, %add.11
+ %14 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 14), align 8
+ %add.13 = add nsw i32 %14, %add.12
+ %15 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 15), align 4
+ %add.14 = add nsw i32 %15, %add.13
+ %16 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 16), align 16
+ %add.15 = add nsw i32 %16, %add.14
+ %17 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 17), align 4
+ %add.16 = add nsw i32 %17, %add.15
+ %18 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 18), align 8
+ %add.17 = add nsw i32 %18, %add.16
+ %19 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 19), align 4
+ %add.18 = add nsw i32 %19, %add.17
+ %20 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 20), align 16
+ %add.19 = add nsw i32 %20, %add.18
+ %21 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 21), align 4
+ %add.20 = add nsw i32 %21, %add.19
+ %22 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 22), align 8
+ %add.21 = add nsw i32 %22, %add.20
+ %23 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 23), align 4
+ %add.22 = add nsw i32 %23, %add.21
+ %24 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 24), align 16
+ %add.23 = add nsw i32 %24, %add.22
+ %25 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 25), align 4
+ %add.24 = add nsw i32 %25, %add.23
+ %26 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 26), align 8
+ %add.25 = add nsw i32 %26, %add.24
+ %27 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 27), align 4
+ %add.26 = add nsw i32 %27, %add.25
+ %28 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 28), align 16
+ %add.27 = add nsw i32 %28, %add.26
+ %29 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 29), align 4
+ %add.28 = add nsw i32 %29, %add.27
+ %30 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 30), align 8
+ %add.29 = add nsw i32 %30, %add.28
+ %31 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 31), align 4
+ %add.30 = add nsw i32 %31, %add.29
+ store i32 %add.30, i32* %res, align 16
+ ret void
+}
+
+declare i32 @foobar(i32)
+
+define void @i32_red_call(i32 %val) {
+; CHECK-LABEL: @i32_red_call(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]]
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]]
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]]
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]]
+; CHECK-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @i32_red_call(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]]
+; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
+; STORE-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]]
+; STORE-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]]
+; STORE-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; STORE-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]]
+; STORE-NEXT: [[RES:%.*]] = call i32 @foobar(i32 [[TMP1]])
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+ %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+ %add = add nsw i32 %1, %0
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+ %add.1 = add nsw i32 %2, %add
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+ %add.2 = add nsw i32 %3, %add.1
+ %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+ %add.3 = add nsw i32 %4, %add.2
+ %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+ %add.4 = add nsw i32 %5, %add.3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+ %add.5 = add nsw i32 %6, %add.4
+ %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+ %add.6 = add nsw i32 %7, %add.5
+ %res = call i32 @foobar(i32 %add.6)
+ ret void
+}
+
+define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-LABEL: @i32_red_invoke(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef
+; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]]
+; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]]
+; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]]
+; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]]
+; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
+; CHECK-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
+; CHECK: exception:
+; CHECK-NEXT: [[CLEANUP:%.*]] = landingpad i8
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: br label [[NORMAL]]
+; CHECK: normal:
+; CHECK-NEXT: ret void
+;
+; STORE-LABEL: @i32_red_invoke(
+; STORE-NEXT: entry:
+; STORE-NEXT: [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
+; STORE-NEXT: [[ADD:%.*]] = add nsw i32 undef, undef
+; STORE-NEXT: [[ADD_1:%.*]] = add nsw i32 undef, [[ADD]]
+; STORE-NEXT: [[ADD_2:%.*]] = add nsw i32 undef, [[ADD_1]]
+; STORE-NEXT: [[ADD_3:%.*]] = add nsw i32 undef, [[ADD_2]]
+; STORE-NEXT: [[ADD_4:%.*]] = add nsw i32 undef, [[ADD_3]]
+; STORE-NEXT: [[ADD_5:%.*]] = add nsw i32 undef, [[ADD_4]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; STORE-NEXT: [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; STORE-NEXT: [[ADD_6:%.*]] = add nsw i32 undef, [[ADD_5]]
+; STORE-NEXT: [[RES:%.*]] = invoke i32 @foobar(i32 [[TMP1]])
+; STORE-NEXT: to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
+; STORE: exception:
+; STORE-NEXT: [[CLEANUP:%.*]] = landingpad i8
+; STORE-NEXT: cleanup
+; STORE-NEXT: br label [[NORMAL]]
+; STORE: normal:
+; STORE-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+ %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+ %add = add nsw i32 %1, %0
+ %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+ %add.1 = add nsw i32 %2, %add
+ %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+ %add.2 = add nsw i32 %3, %add.1
+ %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+ %add.3 = add nsw i32 %4, %add.2
+ %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+ %add.4 = add nsw i32 %5, %add.3
+ %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+ %add.5 = add nsw i32 %6, %add.4
+ %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+ %add.6 = add nsw i32 %7, %add.5
+ %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
+exception:
+ %cleanup = landingpad i8 cleanup
+ br label %normal
+normal:
+ ret void
+}
+
+declare i32 @__gxx_personality_v0(...)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
new file mode 100644
index 00000000000..d6e44aa1d6a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -0,0 +1,527 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+
+;
+; 128-bit vectors
+;
+
+define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b) {
+; SSE-LABEL: @test_v2f64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: ret <2 x double> [[TMP3]]
+;
+; SLM-LABEL: @test_v2f64(
+; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
+; SLM-NEXT: [[R0:%.*]] = fsub double [[A0]], [[A1]]
+; SLM-NEXT: [[R1:%.*]] = fsub double [[B0]], [[B1]]
+; SLM-NEXT: [[R00:%.*]] = insertelement <2 x double> undef, double [[R0]], i32 0
+; SLM-NEXT: [[R01:%.*]] = insertelement <2 x double> [[R00]], double [[R1]], i32 1
+; SLM-NEXT: ret <2 x double> [[R01]]
+;
+; AVX-LABEL: @test_v2f64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <2 x double> [[TMP3]]
+;
+; AVX512-LABEL: @test_v2f64(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <2 x double> [[TMP3]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %r0 = fsub double %a0, %a1
+ %r1 = fsub double %b0, %b1
+ %r00 = insertelement <2 x double> undef, double %r0, i32 0
+ %r01 = insertelement <2 x double> %r00, double %r1, i32 1
+ ret <2 x double> %r01
+}
+
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_v4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %r0 = fsub float %a0, %a1
+ %r1 = fsub float %a2, %a3
+ %r2 = fsub float %b0, %b1
+ %r3 = fsub float %b2, %b3
+ %r00 = insertelement <4 x float> undef, float %r0, i32 0
+ %r01 = insertelement <4 x float> %r00, float %r1, i32 1
+ %r02 = insertelement <4 x float> %r01, float %r2, i32 2
+ %r03 = insertelement <4 x float> %r02, float %r3, i32 3
+ ret <4 x float> %r03
+}
+
+define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
+; SSE-LABEL: @test_v2i64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; SSE-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; SSE-NEXT: ret <2 x i64> [[TMP3]]
+;
+; SLM-LABEL: @test_v2i64(
+; SLM-NEXT: [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1
+; SLM-NEXT: [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
+; SLM-NEXT: [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1
+; SLM-NEXT: [[R0:%.*]] = sub i64 [[A0]], [[A1]]
+; SLM-NEXT: [[R1:%.*]] = sub i64 [[B0]], [[B1]]
+; SLM-NEXT: [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0
+; SLM-NEXT: [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1
+; SLM-NEXT: ret <2 x i64> [[R01]]
+;
+; AVX-LABEL: @test_v2i64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <2 x i64> [[TMP3]]
+;
+; AVX512-LABEL: @test_v2i64(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; AVX512-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <2 x i64> [[TMP3]]
+;
+ %a0 = extractelement <2 x i64> %a, i32 0
+ %a1 = extractelement <2 x i64> %a, i32 1
+ %b0 = extractelement <2 x i64> %b, i32 0
+ %b1 = extractelement <2 x i64> %b, i32 1
+ %r0 = sub i64 %a0, %a1
+ %r1 = sub i64 %b0, %b1
+ %r00 = insertelement <2 x i64> undef, i64 %r0, i32 0
+ %r01 = insertelement <2 x i64> %r00, i64 %r1, i32 1
+ ret <2 x i64> %r01
+}
+
+define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @test_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %r0 = sub i32 %a0, %a1
+ %r1 = sub i32 %a2, %a3
+ %r2 = sub i32 %b0, %b1
+ %r3 = sub i32 %b2, %b3
+ %r00 = insertelement <4 x i32> undef, i32 %r0, i32 0
+ %r01 = insertelement <4 x i32> %r00, i32 %r1, i32 1
+ %r02 = insertelement <4 x i32> %r01, i32 %r2, i32 2
+ %r03 = insertelement <4 x i32> %r02, i32 %r3, i32 3
+ ret <4 x i32> %r03
+}
+
+define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @test_v8i16(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %a0 = extractelement <8 x i16> %a, i32 0
+ %a1 = extractelement <8 x i16> %a, i32 1
+ %a2 = extractelement <8 x i16> %a, i32 2
+ %a3 = extractelement <8 x i16> %a, i32 3
+ %a4 = extractelement <8 x i16> %a, i32 4
+ %a5 = extractelement <8 x i16> %a, i32 5
+ %a6 = extractelement <8 x i16> %a, i32 6
+ %a7 = extractelement <8 x i16> %a, i32 7
+ %b0 = extractelement <8 x i16> %b, i32 0
+ %b1 = extractelement <8 x i16> %b, i32 1
+ %b2 = extractelement <8 x i16> %b, i32 2
+ %b3 = extractelement <8 x i16> %b, i32 3
+ %b4 = extractelement <8 x i16> %b, i32 4
+ %b5 = extractelement <8 x i16> %b, i32 5
+ %b6 = extractelement <8 x i16> %b, i32 6
+ %b7 = extractelement <8 x i16> %b, i32 7
+ %r0 = sub i16 %a0, %a1
+ %r1 = sub i16 %a2, %a3
+ %r2 = sub i16 %a4, %a5
+ %r3 = sub i16 %a6, %a7
+ %r4 = sub i16 %b0, %b1
+ %r5 = sub i16 %b2, %b3
+ %r6 = sub i16 %b4, %b5
+ %r7 = sub i16 %b6, %b7
+ %r00 = insertelement <8 x i16> undef, i16 %r0, i32 0
+ %r01 = insertelement <8 x i16> %r00, i16 %r1, i32 1
+ %r02 = insertelement <8 x i16> %r01, i16 %r2, i32 2
+ %r03 = insertelement <8 x i16> %r02, i16 %r3, i32 3
+ %r04 = insertelement <8 x i16> %r03, i16 %r4, i32 4
+ %r05 = insertelement <8 x i16> %r04, i16 %r5, i32 5
+ %r06 = insertelement <8 x i16> %r05, i16 %r6, i32 6
+ %r07 = insertelement <8 x i16> %r06, i16 %r7, i32 7
+ ret <8 x i16> %r07
+}
+
+;
+; 256-bit vectors
+;
+
+define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @test_v4f64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT: ret <4 x double> [[R03]]
+;
+; SLM-LABEL: @test_v4f64(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]]
+; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: ret <4 x double> [[R03]]
+;
+; AVX-LABEL: @test_v4f64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x double> [[TMP3]]
+;
+; AVX512-LABEL: @test_v4f64(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX512-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <4 x double> [[TMP3]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %r0 = fsub double %a0, %a1
+ %r1 = fsub double %b0, %b1
+ %r2 = fsub double %a2, %a3
+ %r3 = fsub double %b2, %b3
+ %r00 = insertelement <4 x double> undef, double %r0, i32 0
+ %r01 = insertelement <4 x double> %r00, double %r1, i32 1
+ %r02 = insertelement <4 x double> %r01, double %r2, i32 2
+ %r03 = insertelement <4 x double> %r02, double %r3, i32 3
+ ret <4 x double> %r03
+}
+
+define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @test_v8f32(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x float> [[R07]]
+;
+; SLM-LABEL: @test_v8f32(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT: [[TMP6:%.*]] = fsub <4 x float> [[TMP4]], [[TMP5]]
+; SLM-NEXT: [[R07:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x float> [[R07]]
+;
+; AVX-LABEL: @test_v8f32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <8 x float> [[TMP3]]
+;
+; AVX512-LABEL: @test_v8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX512-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <8 x float> [[TMP3]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %r0 = fsub float %a0, %a1
+ %r1 = fsub float %a2, %a3
+ %r2 = fsub float %b0, %b1
+ %r3 = fsub float %b2, %b3
+ %r4 = fsub float %a4, %a5
+ %r5 = fsub float %a6, %a7
+ %r6 = fsub float %b4, %b5
+ %r7 = fsub float %b6, %b7
+ %r00 = insertelement <8 x float> undef, float %r0, i32 0
+ %r01 = insertelement <8 x float> %r00, float %r1, i32 1
+ %r02 = insertelement <8 x float> %r01, float %r2, i32 2
+ %r03 = insertelement <8 x float> %r02, float %r3, i32 3
+ %r04 = insertelement <8 x float> %r03, float %r4, i32 4
+ %r05 = insertelement <8 x float> %r04, float %r5, i32 5
+ %r06 = insertelement <8 x float> %r05, float %r6, i32 6
+ %r07 = insertelement <8 x float> %r06, float %r7, i32 7
+ ret <8 x float> %r07
+}
+
+define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
+; SSE-LABEL: @test_v4i64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT: ret <4 x i64> [[R03]]
+;
+; SLM-LABEL: @test_v4i64(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
+; SLM-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
+; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]]
+; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SLM-NEXT: ret <4 x i64> [[R03]]
+;
+; AVX-LABEL: @test_v4i64(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x i64> [[TMP3]]
+;
+; AVX512-LABEL: @test_v4i64(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX512-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <4 x i64> [[TMP3]]
+;
+ %a0 = extractelement <4 x i64> %a, i32 0
+ %a1 = extractelement <4 x i64> %a, i32 1
+ %a2 = extractelement <4 x i64> %a, i32 2
+ %a3 = extractelement <4 x i64> %a, i32 3
+ %b0 = extractelement <4 x i64> %b, i32 0
+ %b1 = extractelement <4 x i64> %b, i32 1
+ %b2 = extractelement <4 x i64> %b, i32 2
+ %b3 = extractelement <4 x i64> %b, i32 3
+ %r0 = sub i64 %a0, %a1
+ %r1 = sub i64 %b0, %b1
+ %r2 = sub i64 %a2, %a3
+ %r3 = sub i64 %b2, %b3
+ %r00 = insertelement <4 x i64> undef, i64 %r0, i32 0
+ %r01 = insertelement <4 x i64> %r00, i64 %r1, i32 1
+ %r02 = insertelement <4 x i64> %r01, i64 %r2, i32 2
+ %r03 = insertelement <4 x i64> %r02, i64 %r3, i32 3
+ ret <4 x i64> %r03
+}
+
+define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
+; SSE-LABEL: @test_v8i32(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[R07]]
+;
+; SLM-LABEL: @test_v8i32(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SLM-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
+; SLM-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SLM-NEXT: ret <8 x i32> [[R07]]
+;
+; AVX-LABEL: @test_v8i32(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
+;
+; AVX512-LABEL: @test_v8i32(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX512-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <8 x i32> [[TMP3]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %r0 = sub i32 %a0, %a1
+ %r1 = sub i32 %a2, %a3
+ %r2 = sub i32 %b0, %b1
+ %r3 = sub i32 %b2, %b3
+ %r4 = sub i32 %a4, %a5
+ %r5 = sub i32 %a6, %a7
+ %r6 = sub i32 %b4, %b5
+ %r7 = sub i32 %b6, %b7
+ %r00 = insertelement <8 x i32> undef, i32 %r0, i32 0
+ %r01 = insertelement <8 x i32> %r00, i32 %r1, i32 1
+ %r02 = insertelement <8 x i32> %r01, i32 %r2, i32 2
+ %r03 = insertelement <8 x i32> %r02, i32 %r3, i32 3
+ %r04 = insertelement <8 x i32> %r03, i32 %r4, i32 4
+ %r05 = insertelement <8 x i32> %r04, i32 %r5, i32 5
+ %r06 = insertelement <8 x i32> %r05, i32 %r6, i32 6
+ %r07 = insertelement <8 x i32> %r06, i32 %r7, i32 7
+ ret <8 x i32> %r07
+}
+
+define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
+; SSE-LABEL: @test_v16i16(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: ret <16 x i16> [[RV15]]
+;
+; SLM-LABEL: @test_v16i16(
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; SLM-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; SLM-NEXT: ret <16 x i16> [[TMP3]]
+;
+; AVX-LABEL: @test_v16i16(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <16 x i16> [[TMP3]]
+;
+; AVX512-LABEL: @test_v16i16(
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
+; AVX512-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: ret <16 x i16> [[TMP3]]
+;
+ %a0 = extractelement <16 x i16> %a, i32 0
+ %a1 = extractelement <16 x i16> %a, i32 1
+ %a2 = extractelement <16 x i16> %a, i32 2
+ %a3 = extractelement <16 x i16> %a, i32 3
+ %a4 = extractelement <16 x i16> %a, i32 4
+ %a5 = extractelement <16 x i16> %a, i32 5
+ %a6 = extractelement <16 x i16> %a, i32 6
+ %a7 = extractelement <16 x i16> %a, i32 7
+ %a8 = extractelement <16 x i16> %a, i32 8
+ %a9 = extractelement <16 x i16> %a, i32 9
+ %a10 = extractelement <16 x i16> %a, i32 10
+ %a11 = extractelement <16 x i16> %a, i32 11
+ %a12 = extractelement <16 x i16> %a, i32 12
+ %a13 = extractelement <16 x i16> %a, i32 13
+ %a14 = extractelement <16 x i16> %a, i32 14
+ %a15 = extractelement <16 x i16> %a, i32 15
+ %b0 = extractelement <16 x i16> %b, i32 0
+ %b1 = extractelement <16 x i16> %b, i32 1
+ %b2 = extractelement <16 x i16> %b, i32 2
+ %b3 = extractelement <16 x i16> %b, i32 3
+ %b4 = extractelement <16 x i16> %b, i32 4
+ %b5 = extractelement <16 x i16> %b, i32 5
+ %b6 = extractelement <16 x i16> %b, i32 6
+ %b7 = extractelement <16 x i16> %b, i32 7
+ %b8 = extractelement <16 x i16> %b, i32 8
+ %b9 = extractelement <16 x i16> %b, i32 9
+ %b10 = extractelement <16 x i16> %b, i32 10
+ %b11 = extractelement <16 x i16> %b, i32 11
+ %b12 = extractelement <16 x i16> %b, i32 12
+ %b13 = extractelement <16 x i16> %b, i32 13
+ %b14 = extractelement <16 x i16> %b, i32 14
+ %b15 = extractelement <16 x i16> %b, i32 15
+ %r0 = sub i16 %a0 , %a1
+ %r1 = sub i16 %a2 , %a3
+ %r2 = sub i16 %a4 , %a5
+ %r3 = sub i16 %a6 , %a7
+ %r4 = sub i16 %b0 , %b1
+ %r5 = sub i16 %b2 , %b3
+ %r6 = sub i16 %b4 , %b5
+ %r7 = sub i16 %b6 , %b7
+ %r8 = sub i16 %a8 , %a9
+ %r9 = sub i16 %a10, %a11
+ %r10 = sub i16 %a12, %a13
+ %r11 = sub i16 %a14, %a15
+ %r12 = sub i16 %b8 , %b9
+ %r13 = sub i16 %b10, %b11
+ %r14 = sub i16 %b12, %b13
+ %r15 = sub i16 %b14, %b15
+ %rv0 = insertelement <16 x i16> undef, i16 %r0 , i32 0
+ %rv1 = insertelement <16 x i16> %rv0 , i16 %r1 , i32 1
+ %rv2 = insertelement <16 x i16> %rv1 , i16 %r2 , i32 2
+ %rv3 = insertelement <16 x i16> %rv2 , i16 %r3 , i32 3
+ %rv4 = insertelement <16 x i16> %rv3 , i16 %r4 , i32 4
+ %rv5 = insertelement <16 x i16> %rv4 , i16 %r5 , i32 5
+ %rv6 = insertelement <16 x i16> %rv5 , i16 %r6 , i32 6
+ %rv7 = insertelement <16 x i16> %rv6 , i16 %r7 , i32 7
+ %rv8 = insertelement <16 x i16> %rv7 , i16 %r8 , i32 8
+ %rv9 = insertelement <16 x i16> %rv8 , i16 %r9 , i32 9
+ %rv10 = insertelement <16 x i16> %rv9 , i16 %r10, i32 10
+ %rv11 = insertelement <16 x i16> %rv10, i16 %r11, i32 11
+ %rv12 = insertelement <16 x i16> %rv11, i16 %r12, i32 12
+ %rv13 = insertelement <16 x i16> %rv12, i16 %r13, i32 13
+ %rv14 = insertelement <16 x i16> %rv13, i16 %r14, i32 14
+ %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
+ ret <16 x i16> %rv15
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/implicitfloat.ll b/llvm/test/Transforms/SLPVectorizer/X86/implicitfloat.ll
new file mode 100644
index 00000000000..2fbb8f0477f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/implicitfloat.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Don't vectorize when noimplicitfloat is used.
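+; The noimplicitfloat attribute forbids introducing floating-point (and thus
+; vector) instructions that are not explicit in the source, so the SLP
+; vectorizer must bail out here; the checks verify that the scalar loads,
+; fmuls and stores survive unvectorized.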
+define void @test1(double* %a, double* %b, double* %c) noimplicitfloat { ; <------ noimplicitfloat attribute here!
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I0:%.*]] = load double, double* [[A:%.*]], align 8
+; CHECK-NEXT: [[I1:%.*]] = load double, double* [[B:%.*]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/in-tree-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
new file mode 100644
index 00000000000..7e0cfb77c6b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@.str = private unnamed_addr constant [6 x i8] c"bingo\00", align 1
+
+; Uses inside the tree must be scheduled after the corresponding tree bundle.
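+; %InTreeUser consumes %add, which becomes lane 0 of the vectorized bundle;
+; the checks confirm the vectorizer extracts that lane (extractelement ...,
+; i32 0) and schedules the scalar fadd after the bundle that defines it.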
+define void @in_tree_user(double* nocapture %A, i32 %n) {
+; CHECK-LABEL: @in_tree_user(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[N:%.*]] to double
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[CONV]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[CONV]], i32 1
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 7.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 5.000000e+00, double 9.000000e+00>
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT: [[INTREEUSER:%.*]] = fadd double [[TMP8]], [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT: [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]]
+; CHECK-NEXT: br i1 [[CMP11]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK: if.then:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0))
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 100
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: store double [[INTREEUSER]], double* [[A]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %conv = sitofp i32 %n to double
+ br label %for.body
+
+for.body: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds double, double* %A, i64 %0
+ %1 = load double, double* %arrayidx, align 8
+ %mul1 = fmul double %conv, %1
+ %mul2 = fmul double %mul1, 7.000000e+00
+ %add = fadd double %mul2, 5.000000e+00
+ %InTreeUser = fadd double %add, %add ; <------------------ In tree user.
+ %2 = or i64 %0, 1
+ %arrayidx6 = getelementptr inbounds double, double* %A, i64 %2
+ %3 = load double, double* %arrayidx6, align 8
+ %mul8 = fmul double %conv, %3
+ %mul9 = fmul double %mul8, 4.000000e+00
+ %add10 = fadd double %mul9, 9.000000e+00
+ %cmp11 = fcmp ogt double %add, %add10
+ br i1 %cmp11, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+ %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str, i64 0, i64 0))
+ br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.inc
+ store double %InTreeUser, double* %A, align 8 ; Avoid dead code elimination of the InTreeUser.
+ ret void
+}
+
+declare i32 @printf(i8* nocapture, ...)
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
new file mode 100644
index 00000000000..2a4d457f106
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
@@ -0,0 +1,701 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
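+; The file name suggests these tests pin down where the SLP vectorizer
+; inserts new instructions relative to an already-scheduled bundle.
+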
+; Function Attrs: norecurse nounwind readnone uwtable
+define zeroext i8 @foo(i32 %x, i32 %y, i32 %a, i32 %b) local_unnamed_addr #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[B_A:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A:%.*]]
+; CHECK-NEXT: [[RETVAL_0:%.*]] = trunc i32 [[B_A]] to i8
+; CHECK-NEXT: ret i8 [[RETVAL_0]]
+;
+entry:
+ %cmp = icmp slt i32 %x, %y
+ %b.a = select i1 %cmp, i32 %b, i32 %a
+ %retval.0 = trunc i32 %b.a to i8
+ ret i8 %retval.0
+}
+
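+; @bar repeats what appears to be the inlined body of @foo (note the *.i
+; value names) across 16 consecutive bytes per iteration; the checks show it
+; vectorized into <16 x i8> loads and selects, a zext to <16 x i32>, a mul by
+; the broadcast %w, and a truncating <16 x i8> store.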
+define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i8* noalias nocapture readonly %d, i8* noalias nocapture %e, i32 %w) local_unnamed_addr #1 {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], i32 5
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], i32 6
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], i32 7
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], i32 8
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], i32 9
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], i32 10
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[W]], i32 11
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[W]], i32 12
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[W]], i32 13
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[W]], i32 14
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[W]], i32 15
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 1
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 1
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 1
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 1
+; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 2
+; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 2
+; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 2
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2
+; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2
+; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3
+; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3
+; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3
+; CHECK-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3
+; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3
+; CHECK-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4
+; CHECK-NEXT: [[ARRAYIDX47:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 4
+; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 4
+; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 4
+; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 4
+; CHECK-NEXT: [[ARRAYIDX57:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 5
+; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 5
+; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 5
+; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 5
+; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 5
+; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 6
+; CHECK-NEXT: [[ARRAYIDX71:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 6
+; CHECK-NEXT: [[ARRAYIDX73:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 6
+; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6
+; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6
+; CHECK-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7
+; CHECK-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7
+; CHECK-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7
+; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7
+; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7
+; CHECK-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8
+; CHECK-NEXT: [[ARRAYIDX95:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 8
+; CHECK-NEXT: [[ARRAYIDX97:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 8
+; CHECK-NEXT: [[ARRAYIDX100:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 8
+; CHECK-NEXT: [[ARRAYIDX104:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 8
+; CHECK-NEXT: [[ARRAYIDX105:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 9
+; CHECK-NEXT: [[ARRAYIDX107:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 9
+; CHECK-NEXT: [[ARRAYIDX109:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 9
+; CHECK-NEXT: [[ARRAYIDX112:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 9
+; CHECK-NEXT: [[ARRAYIDX116:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 9
+; CHECK-NEXT: [[ARRAYIDX117:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 10
+; CHECK-NEXT: [[ARRAYIDX119:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 10
+; CHECK-NEXT: [[ARRAYIDX121:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 10
+; CHECK-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10
+; CHECK-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10
+; CHECK-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11
+; CHECK-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11
+; CHECK-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11
+; CHECK-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11
+; CHECK-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11
+; CHECK-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12
+; CHECK-NEXT: [[ARRAYIDX143:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 12
+; CHECK-NEXT: [[ARRAYIDX145:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 12
+; CHECK-NEXT: [[ARRAYIDX148:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 12
+; CHECK-NEXT: [[ARRAYIDX152:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 12
+; CHECK-NEXT: [[ARRAYIDX153:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 13
+; CHECK-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 13
+; CHECK-NEXT: [[ARRAYIDX157:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 13
+; CHECK-NEXT: [[ARRAYIDX160:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 13
+; CHECK-NEXT: [[ARRAYIDX164:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 13
+; CHECK-NEXT: [[ARRAYIDX165:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 14
+; CHECK-NEXT: [[ARRAYIDX167:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 14
+; CHECK-NEXT: [[ARRAYIDX169:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 14
+; CHECK-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14
+; CHECK-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14
+; CHECK-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>*
+; CHECK-NEXT: [[TMP17:%.*]] = load <16 x i8>, <16 x i8>* [[TMP16]], align 1
+; CHECK-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>*
+; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i8>, <16 x i8>* [[TMP18]], align 1
+; CHECK-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>*
+; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i8>, <16 x i8>* [[TMP20]], align 1
+; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>*
+; CHECK-NEXT: [[TMP23:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 1
+; CHECK-NEXT: [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]]
+; CHECK-NEXT: [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]]
+; CHECK-NEXT: [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
+; CHECK-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TMP28]], <16 x i8>* [[TMP29]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 16
+; CHECK-NEXT: [[ADD_PTR189]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 16
+; CHECK-NEXT: [[ADD_PTR190]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 16
+; CHECK-NEXT: [[ADD_PTR191]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 16
+; CHECK-NEXT: [[ADD_PTR192]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 16
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 8
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.0356 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %a.addr.0355 = phi i8* [ %a, %entry ], [ %add.ptr, %for.body ]
+ %e.addr.0354 = phi i8* [ %e, %entry ], [ %add.ptr192, %for.body ]
+ %d.addr.0353 = phi i8* [ %d, %entry ], [ %add.ptr191, %for.body ]
+ %c.addr.0352 = phi i8* [ %c, %entry ], [ %add.ptr190, %for.body ]
+ %b.addr.0351 = phi i8* [ %b, %entry ], [ %add.ptr189, %for.body ]
+ %0 = load i8, i8* %c.addr.0352, align 1
+ %1 = load i8, i8* %d.addr.0353, align 1
+ %2 = load i8, i8* %a.addr.0355, align 1
+ %3 = load i8, i8* %b.addr.0351, align 1
+ %cmp.i = icmp ult i8 %0, %1
+ %b.a.i.v.v = select i1 %cmp.i, i8 %3, i8 %2
+ %b.a.i.v = zext i8 %b.a.i.v.v to i32
+ %b.a.i = mul i32 %b.a.i.v, %w
+ %retval.0.i = trunc i32 %b.a.i to i8
+ store i8 %retval.0.i, i8* %e.addr.0354, align 1
+ %arrayidx9 = getelementptr inbounds i8, i8* %c.addr.0352, i64 1
+ %4 = load i8, i8* %arrayidx9, align 1
+ %arrayidx11 = getelementptr inbounds i8, i8* %d.addr.0353, i64 1
+ %5 = load i8, i8* %arrayidx11, align 1
+ %arrayidx13 = getelementptr inbounds i8, i8* %a.addr.0355, i64 1
+ %6 = load i8, i8* %arrayidx13, align 1
+ %arrayidx16 = getelementptr inbounds i8, i8* %b.addr.0351, i64 1
+ %7 = load i8, i8* %arrayidx16, align 1
+ %cmp.i348 = icmp ult i8 %4, %5
+ %b.a.i349.v.v = select i1 %cmp.i348, i8 %7, i8 %6
+ %b.a.i349.v = zext i8 %b.a.i349.v.v to i32
+ %b.a.i349 = mul i32 %b.a.i349.v, %w
+ %retval.0.i350 = trunc i32 %b.a.i349 to i8
+ %arrayidx20 = getelementptr inbounds i8, i8* %e.addr.0354, i64 1
+ store i8 %retval.0.i350, i8* %arrayidx20, align 1
+ %arrayidx21 = getelementptr inbounds i8, i8* %c.addr.0352, i64 2
+ %8 = load i8, i8* %arrayidx21, align 1
+ %arrayidx23 = getelementptr inbounds i8, i8* %d.addr.0353, i64 2
+ %9 = load i8, i8* %arrayidx23, align 1
+ %arrayidx25 = getelementptr inbounds i8, i8* %a.addr.0355, i64 2
+ %10 = load i8, i8* %arrayidx25, align 1
+ %arrayidx28 = getelementptr inbounds i8, i8* %b.addr.0351, i64 2
+ %11 = load i8, i8* %arrayidx28, align 1
+ %cmp.i345 = icmp ult i8 %8, %9
+ %b.a.i346.v.v = select i1 %cmp.i345, i8 %11, i8 %10
+ %b.a.i346.v = zext i8 %b.a.i346.v.v to i32
+ %b.a.i346 = mul i32 %b.a.i346.v, %w
+ %retval.0.i347 = trunc i32 %b.a.i346 to i8
+ %arrayidx32 = getelementptr inbounds i8, i8* %e.addr.0354, i64 2
+ store i8 %retval.0.i347, i8* %arrayidx32, align 1
+ %arrayidx33 = getelementptr inbounds i8, i8* %c.addr.0352, i64 3
+ %12 = load i8, i8* %arrayidx33, align 1
+ %arrayidx35 = getelementptr inbounds i8, i8* %d.addr.0353, i64 3
+ %13 = load i8, i8* %arrayidx35, align 1
+ %arrayidx37 = getelementptr inbounds i8, i8* %a.addr.0355, i64 3
+ %14 = load i8, i8* %arrayidx37, align 1
+ %arrayidx40 = getelementptr inbounds i8, i8* %b.addr.0351, i64 3
+ %15 = load i8, i8* %arrayidx40, align 1
+ %cmp.i342 = icmp ult i8 %12, %13
+ %b.a.i343.v.v = select i1 %cmp.i342, i8 %15, i8 %14
+ %b.a.i343.v = zext i8 %b.a.i343.v.v to i32
+ %b.a.i343 = mul i32 %b.a.i343.v, %w
+ %retval.0.i344 = trunc i32 %b.a.i343 to i8
+ %arrayidx44 = getelementptr inbounds i8, i8* %e.addr.0354, i64 3
+ store i8 %retval.0.i344, i8* %arrayidx44, align 1
+ %arrayidx45 = getelementptr inbounds i8, i8* %c.addr.0352, i64 4
+ %16 = load i8, i8* %arrayidx45, align 1
+ %arrayidx47 = getelementptr inbounds i8, i8* %d.addr.0353, i64 4
+ %17 = load i8, i8* %arrayidx47, align 1
+ %arrayidx49 = getelementptr inbounds i8, i8* %a.addr.0355, i64 4
+ %18 = load i8, i8* %arrayidx49, align 1
+ %arrayidx52 = getelementptr inbounds i8, i8* %b.addr.0351, i64 4
+ %19 = load i8, i8* %arrayidx52, align 1
+ %cmp.i339 = icmp ult i8 %16, %17
+ %b.a.i340.v.v = select i1 %cmp.i339, i8 %19, i8 %18
+ %b.a.i340.v = zext i8 %b.a.i340.v.v to i32
+ %b.a.i340 = mul i32 %b.a.i340.v, %w
+ %retval.0.i341 = trunc i32 %b.a.i340 to i8
+ %arrayidx56 = getelementptr inbounds i8, i8* %e.addr.0354, i64 4
+ store i8 %retval.0.i341, i8* %arrayidx56, align 1
+ %arrayidx57 = getelementptr inbounds i8, i8* %c.addr.0352, i64 5
+ %20 = load i8, i8* %arrayidx57, align 1
+ %arrayidx59 = getelementptr inbounds i8, i8* %d.addr.0353, i64 5
+ %21 = load i8, i8* %arrayidx59, align 1
+ %arrayidx61 = getelementptr inbounds i8, i8* %a.addr.0355, i64 5
+ %22 = load i8, i8* %arrayidx61, align 1
+ %arrayidx64 = getelementptr inbounds i8, i8* %b.addr.0351, i64 5
+ %23 = load i8, i8* %arrayidx64, align 1
+ %cmp.i336 = icmp ult i8 %20, %21
+ %b.a.i337.v.v = select i1 %cmp.i336, i8 %23, i8 %22
+ %b.a.i337.v = zext i8 %b.a.i337.v.v to i32
+ %b.a.i337 = mul i32 %b.a.i337.v, %w
+ %retval.0.i338 = trunc i32 %b.a.i337 to i8
+ %arrayidx68 = getelementptr inbounds i8, i8* %e.addr.0354, i64 5
+ store i8 %retval.0.i338, i8* %arrayidx68, align 1
+ %arrayidx69 = getelementptr inbounds i8, i8* %c.addr.0352, i64 6
+ %24 = load i8, i8* %arrayidx69, align 1
+ %arrayidx71 = getelementptr inbounds i8, i8* %d.addr.0353, i64 6
+ %25 = load i8, i8* %arrayidx71, align 1
+ %arrayidx73 = getelementptr inbounds i8, i8* %a.addr.0355, i64 6
+ %26 = load i8, i8* %arrayidx73, align 1
+ %arrayidx76 = getelementptr inbounds i8, i8* %b.addr.0351, i64 6
+ %27 = load i8, i8* %arrayidx76, align 1
+ %cmp.i333 = icmp ult i8 %24, %25
+ %b.a.i334.v.v = select i1 %cmp.i333, i8 %27, i8 %26
+ %b.a.i334.v = zext i8 %b.a.i334.v.v to i32
+ %b.a.i334 = mul i32 %b.a.i334.v, %w
+ %retval.0.i335 = trunc i32 %b.a.i334 to i8
+ %arrayidx80 = getelementptr inbounds i8, i8* %e.addr.0354, i64 6
+ store i8 %retval.0.i335, i8* %arrayidx80, align 1
+ %arrayidx81 = getelementptr inbounds i8, i8* %c.addr.0352, i64 7
+ %28 = load i8, i8* %arrayidx81, align 1
+ %arrayidx83 = getelementptr inbounds i8, i8* %d.addr.0353, i64 7
+ %29 = load i8, i8* %arrayidx83, align 1
+ %arrayidx85 = getelementptr inbounds i8, i8* %a.addr.0355, i64 7
+ %30 = load i8, i8* %arrayidx85, align 1
+ %arrayidx88 = getelementptr inbounds i8, i8* %b.addr.0351, i64 7
+ %31 = load i8, i8* %arrayidx88, align 1
+ %cmp.i330 = icmp ult i8 %28, %29
+ %b.a.i331.v.v = select i1 %cmp.i330, i8 %31, i8 %30
+ %b.a.i331.v = zext i8 %b.a.i331.v.v to i32
+ %b.a.i331 = mul i32 %b.a.i331.v, %w
+ %retval.0.i332 = trunc i32 %b.a.i331 to i8
+ %arrayidx92 = getelementptr inbounds i8, i8* %e.addr.0354, i64 7
+ store i8 %retval.0.i332, i8* %arrayidx92, align 1
+ %arrayidx93 = getelementptr inbounds i8, i8* %c.addr.0352, i64 8
+ %32 = load i8, i8* %arrayidx93, align 1
+ %arrayidx95 = getelementptr inbounds i8, i8* %d.addr.0353, i64 8
+ %33 = load i8, i8* %arrayidx95, align 1
+ %arrayidx97 = getelementptr inbounds i8, i8* %a.addr.0355, i64 8
+ %34 = load i8, i8* %arrayidx97, align 1
+ %arrayidx100 = getelementptr inbounds i8, i8* %b.addr.0351, i64 8
+ %35 = load i8, i8* %arrayidx100, align 1
+ %cmp.i327 = icmp ult i8 %32, %33
+ %b.a.i328.v.v = select i1 %cmp.i327, i8 %35, i8 %34
+ %b.a.i328.v = zext i8 %b.a.i328.v.v to i32
+ %b.a.i328 = mul i32 %b.a.i328.v, %w
+ %retval.0.i329 = trunc i32 %b.a.i328 to i8
+ %arrayidx104 = getelementptr inbounds i8, i8* %e.addr.0354, i64 8
+ store i8 %retval.0.i329, i8* %arrayidx104, align 1
+ %arrayidx105 = getelementptr inbounds i8, i8* %c.addr.0352, i64 9
+ %36 = load i8, i8* %arrayidx105, align 1
+ %arrayidx107 = getelementptr inbounds i8, i8* %d.addr.0353, i64 9
+ %37 = load i8, i8* %arrayidx107, align 1
+ %arrayidx109 = getelementptr inbounds i8, i8* %a.addr.0355, i64 9
+ %38 = load i8, i8* %arrayidx109, align 1
+ %arrayidx112 = getelementptr inbounds i8, i8* %b.addr.0351, i64 9
+ %39 = load i8, i8* %arrayidx112, align 1
+ %cmp.i324 = icmp ult i8 %36, %37
+ %b.a.i325.v.v = select i1 %cmp.i324, i8 %39, i8 %38
+ %b.a.i325.v = zext i8 %b.a.i325.v.v to i32
+ %b.a.i325 = mul i32 %b.a.i325.v, %w
+ %retval.0.i326 = trunc i32 %b.a.i325 to i8
+ %arrayidx116 = getelementptr inbounds i8, i8* %e.addr.0354, i64 9
+ store i8 %retval.0.i326, i8* %arrayidx116, align 1
+ %arrayidx117 = getelementptr inbounds i8, i8* %c.addr.0352, i64 10
+ %40 = load i8, i8* %arrayidx117, align 1
+ %arrayidx119 = getelementptr inbounds i8, i8* %d.addr.0353, i64 10
+ %41 = load i8, i8* %arrayidx119, align 1
+ %arrayidx121 = getelementptr inbounds i8, i8* %a.addr.0355, i64 10
+ %42 = load i8, i8* %arrayidx121, align 1
+ %arrayidx124 = getelementptr inbounds i8, i8* %b.addr.0351, i64 10
+ %43 = load i8, i8* %arrayidx124, align 1
+ %cmp.i321 = icmp ult i8 %40, %41
+ %b.a.i322.v.v = select i1 %cmp.i321, i8 %43, i8 %42
+ %b.a.i322.v = zext i8 %b.a.i322.v.v to i32
+ %b.a.i322 = mul i32 %b.a.i322.v, %w
+ %retval.0.i323 = trunc i32 %b.a.i322 to i8
+ %arrayidx128 = getelementptr inbounds i8, i8* %e.addr.0354, i64 10
+ store i8 %retval.0.i323, i8* %arrayidx128, align 1
+ %arrayidx129 = getelementptr inbounds i8, i8* %c.addr.0352, i64 11
+ %44 = load i8, i8* %arrayidx129, align 1
+ %arrayidx131 = getelementptr inbounds i8, i8* %d.addr.0353, i64 11
+ %45 = load i8, i8* %arrayidx131, align 1
+ %arrayidx133 = getelementptr inbounds i8, i8* %a.addr.0355, i64 11
+ %46 = load i8, i8* %arrayidx133, align 1
+ %arrayidx136 = getelementptr inbounds i8, i8* %b.addr.0351, i64 11
+ %47 = load i8, i8* %arrayidx136, align 1
+ %cmp.i318 = icmp ult i8 %44, %45
+ %b.a.i319.v.v = select i1 %cmp.i318, i8 %47, i8 %46
+ %b.a.i319.v = zext i8 %b.a.i319.v.v to i32
+ %b.a.i319 = mul i32 %b.a.i319.v, %w
+ %retval.0.i320 = trunc i32 %b.a.i319 to i8
+ %arrayidx140 = getelementptr inbounds i8, i8* %e.addr.0354, i64 11
+ store i8 %retval.0.i320, i8* %arrayidx140, align 1
+ %arrayidx141 = getelementptr inbounds i8, i8* %c.addr.0352, i64 12
+ %48 = load i8, i8* %arrayidx141, align 1
+ %arrayidx143 = getelementptr inbounds i8, i8* %d.addr.0353, i64 12
+ %49 = load i8, i8* %arrayidx143, align 1
+ %arrayidx145 = getelementptr inbounds i8, i8* %a.addr.0355, i64 12
+ %50 = load i8, i8* %arrayidx145, align 1
+ %arrayidx148 = getelementptr inbounds i8, i8* %b.addr.0351, i64 12
+ %51 = load i8, i8* %arrayidx148, align 1
+ %cmp.i315 = icmp ult i8 %48, %49
+ %b.a.i316.v.v = select i1 %cmp.i315, i8 %51, i8 %50
+ %b.a.i316.v = zext i8 %b.a.i316.v.v to i32
+ %b.a.i316 = mul i32 %b.a.i316.v, %w
+ %retval.0.i317 = trunc i32 %b.a.i316 to i8
+ %arrayidx152 = getelementptr inbounds i8, i8* %e.addr.0354, i64 12
+ store i8 %retval.0.i317, i8* %arrayidx152, align 1
+ %arrayidx153 = getelementptr inbounds i8, i8* %c.addr.0352, i64 13
+ %52 = load i8, i8* %arrayidx153, align 1
+ %arrayidx155 = getelementptr inbounds i8, i8* %d.addr.0353, i64 13
+ %53 = load i8, i8* %arrayidx155, align 1
+ %arrayidx157 = getelementptr inbounds i8, i8* %a.addr.0355, i64 13
+ %54 = load i8, i8* %arrayidx157, align 1
+ %arrayidx160 = getelementptr inbounds i8, i8* %b.addr.0351, i64 13
+ %55 = load i8, i8* %arrayidx160, align 1
+ %cmp.i312 = icmp ult i8 %52, %53
+ %b.a.i313.v.v = select i1 %cmp.i312, i8 %55, i8 %54
+ %b.a.i313.v = zext i8 %b.a.i313.v.v to i32
+ %b.a.i313 = mul i32 %b.a.i313.v, %w
+ %retval.0.i314 = trunc i32 %b.a.i313 to i8
+ %arrayidx164 = getelementptr inbounds i8, i8* %e.addr.0354, i64 13
+ store i8 %retval.0.i314, i8* %arrayidx164, align 1
+ %arrayidx165 = getelementptr inbounds i8, i8* %c.addr.0352, i64 14
+ %56 = load i8, i8* %arrayidx165, align 1
+ %arrayidx167 = getelementptr inbounds i8, i8* %d.addr.0353, i64 14
+ %57 = load i8, i8* %arrayidx167, align 1
+ %arrayidx169 = getelementptr inbounds i8, i8* %a.addr.0355, i64 14
+ %58 = load i8, i8* %arrayidx169, align 1
+ %arrayidx172 = getelementptr inbounds i8, i8* %b.addr.0351, i64 14
+ %59 = load i8, i8* %arrayidx172, align 1
+ %cmp.i309 = icmp ult i8 %56, %57
+ %b.a.i310.v.v = select i1 %cmp.i309, i8 %59, i8 %58
+ %b.a.i310.v = zext i8 %b.a.i310.v.v to i32
+ %b.a.i310 = mul i32 %b.a.i310.v, %w
+ %retval.0.i311 = trunc i32 %b.a.i310 to i8
+ %arrayidx176 = getelementptr inbounds i8, i8* %e.addr.0354, i64 14
+ store i8 %retval.0.i311, i8* %arrayidx176, align 1
+ %arrayidx177 = getelementptr inbounds i8, i8* %c.addr.0352, i64 15
+ %60 = load i8, i8* %arrayidx177, align 1
+ %arrayidx179 = getelementptr inbounds i8, i8* %d.addr.0353, i64 15
+ %61 = load i8, i8* %arrayidx179, align 1
+ %arrayidx181 = getelementptr inbounds i8, i8* %a.addr.0355, i64 15
+ %62 = load i8, i8* %arrayidx181, align 1
+ %arrayidx184 = getelementptr inbounds i8, i8* %b.addr.0351, i64 15
+ %63 = load i8, i8* %arrayidx184, align 1
+ %cmp.i306 = icmp ult i8 %60, %61
+ %b.a.i307.v.v = select i1 %cmp.i306, i8 %63, i8 %62
+ %b.a.i307.v = zext i8 %b.a.i307.v.v to i32
+ %b.a.i307 = mul i32 %b.a.i307.v, %w
+ %retval.0.i308 = trunc i32 %b.a.i307 to i8
+ %arrayidx188 = getelementptr inbounds i8, i8* %e.addr.0354, i64 15
+ store i8 %retval.0.i308, i8* %arrayidx188, align 1
+ %inc = add nuw nsw i32 %i.0356, 1
+ %add.ptr = getelementptr inbounds i8, i8* %a.addr.0355, i64 16
+ %add.ptr189 = getelementptr inbounds i8, i8* %b.addr.0351, i64 16
+ %add.ptr190 = getelementptr inbounds i8, i8* %c.addr.0352, i64 16
+ %add.ptr191 = getelementptr inbounds i8, i8* %d.addr.0353, i64 16
+ %add.ptr192 = getelementptr inbounds i8, i8* %e.addr.0354, i64 16
+ %exitcond = icmp eq i32 %inc, 8
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+@ib = local_unnamed_addr global [64 x i32] [i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0], align 16
+@ia = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16
+
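+; @foo1 is a fully unrolled loop of 64 bitwise-not (xor -1) copies from @ib
+; into @ia; the checks show SLP packing it into sixteen <4 x i32>
+; load/xor/store groups, followed by the scalar loop that verifies each
+; element and calls @abort on any mismatch.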
+define i32 @foo1() local_unnamed_addr #0 {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([64 x i32]* @ib to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i32> [[TMP0]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([64 x i32]* @ia to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 4) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 8) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 12) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 16) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 20) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i32> [[TMP10]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 20) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 24) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP14]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 28) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i32> [[TMP16]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 32) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 36) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP19:%.*]] = xor <4 x i32> [[TMP18]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 36) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 40) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP21:%.*]] = xor <4 x i32> [[TMP20]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 40) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 44) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP23:%.*]] = xor <4 x i32> [[TMP22]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP23]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 44) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 48) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP25:%.*]] = xor <4 x i32> [[TMP24]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP25]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 48) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP26:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 52) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP27:%.*]] = xor <4 x i32> [[TMP26]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP27]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 52) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP28:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 56) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP29:%.*]] = xor <4 x i32> [[TMP28]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 56) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP30:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 60) to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP31:%.*]] = xor <4 x i32> [[TMP30]], <i32 -1, i32 -1, i32 -1, i32 -1>
+; CHECK-NEXT: store <4 x i32> [[TMP31]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 60) to <4 x i32>*), align 16
+; CHECK-NEXT: br label [[FOR_BODY5:%.*]]
+; CHECK: for.cond3:
+; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV:%.*]], 1
+; CHECK-NEXT: [[CMP4:%.*]] = icmp ult i64 [[INDVARS_IV]], 63
+; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY5]], label [[FOR_END14:%.*]]
+; CHECK: for.body5:
+; CHECK-NEXT: [[INDVARS_IV]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT]], [[FOR_COND3:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [64 x i32], [64 x i32]* @ia, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP32:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [64 x i32], [64 x i32]* @ib, i64 0, i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[NEG10:%.*]] = xor i32 [[TMP33]], -1
+; CHECK-NEXT: [[CMP11:%.*]] = icmp eq i32 [[TMP32]], [[NEG10]]
+; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_COND3]], label [[IF_THEN:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: tail call void @abort()
+; CHECK-NEXT: unreachable
+; CHECK: for.end14:
+; CHECK-NEXT: ret i32 0
+;
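+; The unrolled entry block negates every element of @ib into @ia; the scalar
+; loop that follows re-checks ia[i] == ~ib[i] and calls @abort on a mismatch.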
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 0), align 16
+ %neg = xor i32 %0, -1
+ store i32 %neg, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 0), align 16
+ %1 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 1), align 4
+ %neg.1 = xor i32 %1, -1
+ store i32 %neg.1, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 1), align 4
+ %2 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 2), align 8
+ %neg.2 = xor i32 %2, -1
+ store i32 %neg.2, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 2), align 8
+ %3 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 3), align 4
+ %neg.3 = xor i32 %3, -1
+ store i32 %neg.3, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 3), align 4
+ %4 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4), align 16
+ %neg.4 = xor i32 %4, -1
+ store i32 %neg.4, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 4), align 16
+ %5 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 5), align 4
+ %neg.5 = xor i32 %5, -1
+ store i32 %neg.5, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 5), align 4
+ %6 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 6), align 8
+ %neg.6 = xor i32 %6, -1
+ store i32 %neg.6, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 6), align 8
+ %7 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 7), align 4
+ %neg.7 = xor i32 %7, -1
+ store i32 %neg.7, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 7), align 4
+ %8 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8), align 16
+ %neg.8 = xor i32 %8, -1
+ store i32 %neg.8, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 8), align 16
+ %9 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 9), align 4
+ %neg.9 = xor i32 %9, -1
+ store i32 %neg.9, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 9), align 4
+ %10 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 10), align 8
+ %neg.10 = xor i32 %10, -1
+ store i32 %neg.10, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 10), align 8
+ %11 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 11), align 4
+ %neg.11 = xor i32 %11, -1
+ store i32 %neg.11, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 11), align 4
+ %12 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12), align 16
+ %neg.12 = xor i32 %12, -1
+ store i32 %neg.12, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 12), align 16
+ %13 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 13), align 4
+ %neg.13 = xor i32 %13, -1
+ store i32 %neg.13, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 13), align 4
+ %14 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 14), align 8
+ %neg.14 = xor i32 %14, -1
+ store i32 %neg.14, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 14), align 8
+ %15 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 15), align 4
+ %neg.15 = xor i32 %15, -1
+ store i32 %neg.15, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 15), align 4
+ %16 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16), align 16
+ %neg.16 = xor i32 %16, -1
+ store i32 %neg.16, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 16), align 16
+ %17 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 17), align 4
+ %neg.17 = xor i32 %17, -1
+ store i32 %neg.17, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 17), align 4
+ %18 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 18), align 8
+ %neg.18 = xor i32 %18, -1
+ store i32 %neg.18, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 18), align 8
+ %19 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 19), align 4
+ %neg.19 = xor i32 %19, -1
+ store i32 %neg.19, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 19), align 4
+ %20 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 20), align 16
+ %neg.20 = xor i32 %20, -1
+ store i32 %neg.20, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 20), align 16
+ %21 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 21), align 4
+ %neg.21 = xor i32 %21, -1
+ store i32 %neg.21, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 21), align 4
+ %22 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 22), align 8
+ %neg.22 = xor i32 %22, -1
+ store i32 %neg.22, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 22), align 8
+ %23 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 23), align 4
+ %neg.23 = xor i32 %23, -1
+ store i32 %neg.23, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 23), align 4
+ %24 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24), align 16
+ %neg.24 = xor i32 %24, -1
+ store i32 %neg.24, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 24), align 16
+ %25 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 25), align 4
+ %neg.25 = xor i32 %25, -1
+ store i32 %neg.25, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 25), align 4
+ %26 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 26), align 8
+ %neg.26 = xor i32 %26, -1
+ store i32 %neg.26, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 26), align 8
+ %27 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 27), align 4
+ %neg.27 = xor i32 %27, -1
+ store i32 %neg.27, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 27), align 4
+ %28 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28), align 16
+ %neg.28 = xor i32 %28, -1
+ store i32 %neg.28, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 28), align 16
+ %29 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 29), align 4
+ %neg.29 = xor i32 %29, -1
+ store i32 %neg.29, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 29), align 4
+ %30 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 30), align 8
+ %neg.30 = xor i32 %30, -1
+ store i32 %neg.30, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 30), align 8
+ %31 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 31), align 4
+ %neg.31 = xor i32 %31, -1
+ store i32 %neg.31, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 31), align 4
+ %32 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32), align 16
+ %neg.32 = xor i32 %32, -1
+ store i32 %neg.32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 32), align 16
+ %33 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 33), align 4
+ %neg.33 = xor i32 %33, -1
+ store i32 %neg.33, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 33), align 4
+ %34 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 34), align 8
+ %neg.34 = xor i32 %34, -1
+ store i32 %neg.34, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 34), align 8
+ %35 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 35), align 4
+ %neg.35 = xor i32 %35, -1
+ store i32 %neg.35, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 35), align 4
+ %36 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 36), align 16
+ %neg.36 = xor i32 %36, -1
+ store i32 %neg.36, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 36), align 16
+ %37 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 37), align 4
+ %neg.37 = xor i32 %37, -1
+ store i32 %neg.37, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 37), align 4
+ %38 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 38), align 8
+ %neg.38 = xor i32 %38, -1
+ store i32 %neg.38, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 38), align 8
+ %39 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 39), align 4
+ %neg.39 = xor i32 %39, -1
+ store i32 %neg.39, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 39), align 4
+ %40 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 40), align 16
+ %neg.40 = xor i32 %40, -1
+ store i32 %neg.40, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 40), align 16
+ %41 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 41), align 4
+ %neg.41 = xor i32 %41, -1
+ store i32 %neg.41, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 41), align 4
+ %42 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 42), align 8
+ %neg.42 = xor i32 %42, -1
+ store i32 %neg.42, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 42), align 8
+ %43 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 43), align 4
+ %neg.43 = xor i32 %43, -1
+ store i32 %neg.43, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 43), align 4
+ %44 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 44), align 16
+ %neg.44 = xor i32 %44, -1
+ store i32 %neg.44, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 44), align 16
+ %45 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 45), align 4
+ %neg.45 = xor i32 %45, -1
+ store i32 %neg.45, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 45), align 4
+ %46 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 46), align 8
+ %neg.46 = xor i32 %46, -1
+ store i32 %neg.46, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 46), align 8
+ %47 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 47), align 4
+ %neg.47 = xor i32 %47, -1
+ store i32 %neg.47, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 47), align 4
+ %48 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 48), align 16
+ %neg.48 = xor i32 %48, -1
+ store i32 %neg.48, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 48), align 16
+ %49 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 49), align 4
+ %neg.49 = xor i32 %49, -1
+ store i32 %neg.49, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 49), align 4
+ %50 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 50), align 8
+ %neg.50 = xor i32 %50, -1
+ store i32 %neg.50, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 50), align 8
+ %51 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 51), align 4
+ %neg.51 = xor i32 %51, -1
+ store i32 %neg.51, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 51), align 4
+ %52 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 52), align 16
+ %neg.52 = xor i32 %52, -1
+ store i32 %neg.52, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 52), align 16
+ %53 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 53), align 4
+ %neg.53 = xor i32 %53, -1
+ store i32 %neg.53, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 53), align 4
+ %54 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 54), align 8
+ %neg.54 = xor i32 %54, -1
+ store i32 %neg.54, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 54), align 8
+ %55 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 55), align 4
+ %neg.55 = xor i32 %55, -1
+ store i32 %neg.55, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 55), align 4
+ %56 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 56), align 16
+ %neg.56 = xor i32 %56, -1
+ store i32 %neg.56, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 56), align 16
+ %57 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 57), align 4
+ %neg.57 = xor i32 %57, -1
+ store i32 %neg.57, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 57), align 4
+ %58 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 58), align 8
+ %neg.58 = xor i32 %58, -1
+ store i32 %neg.58, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 58), align 8
+ %59 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 59), align 4
+ %neg.59 = xor i32 %59, -1
+ store i32 %neg.59, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 59), align 4
+ %60 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 60), align 16
+ %neg.60 = xor i32 %60, -1
+ store i32 %neg.60, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 60), align 16
+ %61 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 61), align 4
+ %neg.61 = xor i32 %61, -1
+ store i32 %neg.61, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 61), align 4
+ %62 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 62), align 8
+ %neg.62 = xor i32 %62, -1
+ store i32 %neg.62, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 62), align 8
+ %63 = load i32, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 63), align 4
+ %neg.63 = xor i32 %63, -1
+ store i32 %neg.63, i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 63), align 4
+ br label %for.body5
+
+for.cond3: ; preds = %for.body5
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %cmp4 = icmp ult i64 %indvars.iv, 63
+ br i1 %cmp4, label %for.body5, label %for.end14
+
+for.body5: ; preds = %entry, %for.cond3
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.cond3 ]
+ %arrayidx7 = getelementptr inbounds [64 x i32], [64 x i32]* @ia, i64 0, i64 %indvars.iv
+ %64 = load i32, i32* %arrayidx7, align 4
+ %arrayidx9 = getelementptr inbounds [64 x i32], [64 x i32]* @ib, i64 0, i64 %indvars.iv
+ %65 = load i32, i32* %arrayidx9, align 4
+ %neg10 = xor i32 %65, -1
+ %cmp11 = icmp eq i32 %64, %neg10
+ br i1 %cmp11, label %for.cond3, label %if.then
+
+if.then: ; preds = %for.body5
+ tail call void @abort() #2
+ unreachable
+
+for.end14: ; preds = %for.cond3
+ ret i32 0
+}
+
+declare void @abort() #2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
new file mode 100644
index 00000000000..5f6e8f143f4
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -0,0 +1,754 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-threshold=-10000 < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -slp-threshold=0 < %s | FileCheck %s --check-prefix=ZEROTHRESH
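+; slp-threshold=-10000 makes virtually any tree look profitable, so the CHECK
+; prefix shows what the vectorizer can build; slp-threshold=0 (ZEROTHRESH)
+; keeps the default cost model and shows what it deems worthwhile.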
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
+
+target triple = "x86_64-apple-macosx10.8.0"
+
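+; Baseline: four scalar icmp/select lanes become a single <4 x i1> compare
+; and <4 x float> select under both thresholds.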
+define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; CHECK-NEXT: ret <4 x float> [[RD]]
+;
+; ZEROTHRESH-LABEL: @simple_select(
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
+; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
+;
+ %c0 = extractelement <4 x i32> %c, i32 0
+ %c1 = extractelement <4 x i32> %c, i32 1
+ %c2 = extractelement <4 x i32> %c, i32 2
+ %c3 = extractelement <4 x i32> %c, i32 3
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %cmp0 = icmp ne i32 %c0, 0
+ %cmp1 = icmp ne i32 %c1, 0
+ %cmp2 = icmp ne i32 %c2, 0
+ %cmp3 = icmp ne i32 %c3, 0
+ %s0 = select i1 %cmp0, float %a0, float %b0
+ %s1 = select i1 %cmp1, float %a1, float %b1
+ %s2 = select i1 %cmp2, float %a2, float %b2
+ %s3 = select i1 %cmp3, float %a3, float %b3
+ %ra = insertelement <4 x float> undef, float %s0, i32 0
+ %rb = insertelement <4 x float> %ra, float %s1, i32 1
+ %rc = insertelement <4 x float> %rb, float %s2, i32 2
+ %rd = insertelement <4 x float> %rc, float %s3, i32 3
+ ret <4 x float> %rd
+}
+
+declare void @llvm.assume(i1) nounwind
+
+; This entire tree is ephemeral, don't vectorize any of it.
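+; (Every value here is used only by the llvm.assume condition; that is what
+; makes the tree ephemeral, so vectorizing it would do no real work.)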
+define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_eph(
+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
+; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
+; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
+; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
+; CHECK-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
+; CHECK-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
+; CHECK-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
+; CHECK-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
+; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2
+; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
+; CHECK-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
+; CHECK-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
+; CHECK-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
+; CHECK-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
+; CHECK-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
+; CHECK-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
+; CHECK-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; CHECK-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
+; CHECK-NEXT: call void @llvm.assume(i1 [[QI]])
+; CHECK-NEXT: ret <4 x float> undef
+;
+; ZEROTHRESH-LABEL: @simple_select_eph(
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
+; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
+; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
+; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
+; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
+; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
+; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
+; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
+; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[S2]], i32 2
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
+; ZEROTHRESH-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
+; ZEROTHRESH-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
+; ZEROTHRESH-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
+; ZEROTHRESH-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
+; ZEROTHRESH-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
+; ZEROTHRESH-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
+; ZEROTHRESH-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
+; ZEROTHRESH-NEXT: [[QI:%.*]] = fcmp olt float [[Q6]], [[Q5]]
+; ZEROTHRESH-NEXT: call void @llvm.assume(i1 [[QI]])
+; ZEROTHRESH-NEXT: ret <4 x float> undef
+;
+ %c0 = extractelement <4 x i32> %c, i32 0
+ %c1 = extractelement <4 x i32> %c, i32 1
+ %c2 = extractelement <4 x i32> %c, i32 2
+ %c3 = extractelement <4 x i32> %c, i32 3
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %cmp0 = icmp ne i32 %c0, 0
+ %cmp1 = icmp ne i32 %c1, 0
+ %cmp2 = icmp ne i32 %c2, 0
+ %cmp3 = icmp ne i32 %c3, 0
+ %s0 = select i1 %cmp0, float %a0, float %b0
+ %s1 = select i1 %cmp1, float %a1, float %b1
+ %s2 = select i1 %cmp2, float %a2, float %b2
+ %s3 = select i1 %cmp3, float %a3, float %b3
+ %ra = insertelement <4 x float> undef, float %s0, i32 0
+ %rb = insertelement <4 x float> %ra, float %s1, i32 1
+ %rc = insertelement <4 x float> %rb, float %s2, i32 2
+ %rd = insertelement <4 x float> %rc, float %s3, i32 3
+ %q0 = extractelement <4 x float> %rd, i32 0
+ %q1 = extractelement <4 x float> %rd, i32 1
+ %q2 = extractelement <4 x float> %rd, i32 2
+ %q3 = extractelement <4 x float> %rd, i32 3
+ %q4 = fadd float %q0, %q1
+ %q5 = fadd float %q2, %q3
+ %q6 = fadd float %q4, %q5
+ %qi = fcmp olt float %q6, %q5
+ call void @llvm.assume(i1 %qi)
+ ret <4 x float> undef
+}
+
+; Insert in an order different from the vector extract indices to make sure
+; the order doesn't matter.
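+; (e.g. in the CHECK lines below, select lane 0 is inserted at index 2 and
+; lane 2 at index 0.)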
+define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_insert_out_of_order(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; CHECK-NEXT: ret <4 x float> [[RD]]
+;
+; ZEROTHRESH-LABEL: @simple_select_insert_out_of_order(
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 0
+; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
+;
+ %c0 = extractelement <4 x i32> %c, i32 0
+ %c1 = extractelement <4 x i32> %c, i32 1
+ %c2 = extractelement <4 x i32> %c, i32 2
+ %c3 = extractelement <4 x i32> %c, i32 3
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %cmp0 = icmp ne i32 %c0, 0
+ %cmp1 = icmp ne i32 %c1, 0
+ %cmp2 = icmp ne i32 %c2, 0
+ %cmp3 = icmp ne i32 %c3, 0
+ %s0 = select i1 %cmp0, float %a0, float %b0
+ %s1 = select i1 %cmp1, float %a1, float %b1
+ %s2 = select i1 %cmp2, float %a2, float %b2
+ %s3 = select i1 %cmp3, float %a3, float %b3
+ %ra = insertelement <4 x float> undef, float %s0, i32 2
+ %rb = insertelement <4 x float> %ra, float %s1, i32 1
+ %rc = insertelement <4 x float> %rb, float %s2, i32 0
+ %rd = insertelement <4 x float> %rc, float %s3, i32 3
+ ret <4 x float> %rd
+}
+
+declare void @v4f32_user(<4 x float>) #0
+declare void @f32_user(float) #0
+
+; Multiple users of the final constructed vector
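+; (The built vector is both passed to @v4f32_user and returned.)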
+define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_users(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; CHECK-NEXT: call void @v4f32_user(<4 x float> [[RD]]) #0
+; CHECK-NEXT: ret <4 x float> [[RD]]
+;
+; ZEROTHRESH-LABEL: @simple_select_users(
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP4]], i32 1
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> [[RB]], float [[TMP5]], i32 2
+; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP6]], i32 3
+; ZEROTHRESH-NEXT: call void @v4f32_user(<4 x float> [[RD]]) #0
+; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
+;
+ %c0 = extractelement <4 x i32> %c, i32 0
+ %c1 = extractelement <4 x i32> %c, i32 1
+ %c2 = extractelement <4 x i32> %c, i32 2
+ %c3 = extractelement <4 x i32> %c, i32 3
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %cmp0 = icmp ne i32 %c0, 0
+ %cmp1 = icmp ne i32 %c1, 0
+ %cmp2 = icmp ne i32 %c2, 0
+ %cmp3 = icmp ne i32 %c3, 0
+ %s0 = select i1 %cmp0, float %a0, float %b0
+ %s1 = select i1 %cmp1, float %a1, float %b1
+ %s2 = select i1 %cmp2, float %a2, float %b2
+ %s3 = select i1 %cmp3, float %a3, float %b3
+ %ra = insertelement <4 x float> undef, float %s0, i32 0
+ %rb = insertelement <4 x float> %ra, float %s1, i32 1
+ %rc = insertelement <4 x float> %rb, float %s2, i32 2
+ %rd = insertelement <4 x float> %rc, float %s3, i32 3
+ call void @v4f32_user(<4 x float> %rd) #0
+ ret <4 x float> %rd
+}
+
+; Unused insertelement
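+; (%rb is dead: the %rc chain restarts from undef, so only lanes 2 and 3 of
+; the returned vector are defined.)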
+define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_no_users(
+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1
+; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0
+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1
+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0
+; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1
+; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3
+; CHECK-NEXT: ret <4 x float> [[RD]]
+;
+; ZEROTHRESH-LABEL: @simple_select_no_users(
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
+; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
+; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
+; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
+; ZEROTHRESH-NEXT: [[CMP3:%.*]] = icmp ne i32 [[C3]], 0
+; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
+; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
+; ZEROTHRESH-NEXT: [[S2:%.*]] = select i1 [[CMP2]], float [[A2]], float [[B2]]
+; ZEROTHRESH-NEXT: [[S3:%.*]] = select i1 [[CMP3]], float [[A3]], float [[B3]]
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[S0]], i32 0
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[S1]], i32 1
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[S2]], i32 2
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
+; ZEROTHRESH-NEXT: ret <4 x float> [[RD]]
+;
+ %c0 = extractelement <4 x i32> %c, i32 0
+ %c1 = extractelement <4 x i32> %c, i32 1
+ %c2 = extractelement <4 x i32> %c, i32 2
+ %c3 = extractelement <4 x i32> %c, i32 3
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %cmp0 = icmp ne i32 %c0, 0
+ %cmp1 = icmp ne i32 %c1, 0
+ %cmp2 = icmp ne i32 %c2, 0
+ %cmp3 = icmp ne i32 %c3, 0
+ %s0 = select i1 %cmp0, float %a0, float %b0
+ %s1 = select i1 %cmp1, float %a1, float %b1
+ %s2 = select i1 %cmp2, float %a2, float %b2
+ %s3 = select i1 %cmp3, float %a3, float %b3
+ %ra = insertelement <4 x float> undef, float %s0, i32 0
+ %rb = insertelement <4 x float> %ra, float %s1, i32 1
+ %rc = insertelement <4 x float> undef, float %s2, i32 2
+ %rd = insertelement <4 x float> %rc, float %s3, i32 3
+ ret <4 x float> %rd
+}
+
+; Make sure an infinite loop doesn't happen, which I ran into when trying
+; to do this backwards.
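+; (The chain merely reconstructs %c lane by lane, so there is no arithmetic
+; to vectorize.)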
+define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
+; CHECK-LABEL: @reconstruct(
+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0
+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1
+; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2
+; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[RD]]
+;
+; ZEROTHRESH-LABEL: @reconstruct(
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3
+; ZEROTHRESH-NEXT: ret <4 x i32> [[RD]]
+;
+ %c0 = extractelement <4 x i32> %c, i32 0
+ %c1 = extractelement <4 x i32> %c, i32 1
+ %c2 = extractelement <4 x i32> %c, i32 2
+ %c3 = extractelement <4 x i32> %c, i32 3
+ %ra = insertelement <4 x i32> undef, i32 %c0, i32 0
+ %rb = insertelement <4 x i32> %ra, i32 %c1, i32 1
+ %rc = insertelement <4 x i32> %rb, i32 %c2, i32 2
+ %rd = insertelement <4 x i32> %rc, i32 %c3, i32 3
+ ret <4 x i32> %rd
+}
+
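+; The same select pattern at <2 x float> width; at zero threshold the 2-wide
+; tree is not considered profitable, so ZEROTHRESH stays scalar.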
+define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_v2(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[TMP4]], i32 1
+; CHECK-NEXT: ret <2 x float> [[RB]]
+;
+; ZEROTHRESH-LABEL: @simple_select_v2(
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1
+; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
+; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
+; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
+; ZEROTHRESH-NEXT: [[S1:%.*]] = select i1 [[CMP1]], float [[A1]], float [[B1]]
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[S0]], i32 0
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <2 x float> [[RA]], float [[S1]], i32 1
+; ZEROTHRESH-NEXT: ret <2 x float> [[RB]]
+;
+ %c0 = extractelement <2 x i32> %c, i32 0
+ %c1 = extractelement <2 x i32> %c, i32 1
+ %a0 = extractelement <2 x float> %a, i32 0
+ %a1 = extractelement <2 x float> %a, i32 1
+ %b0 = extractelement <2 x float> %b, i32 0
+ %b1 = extractelement <2 x float> %b, i32 1
+ %cmp0 = icmp ne i32 %c0, 0
+ %cmp1 = icmp ne i32 %c1, 0
+ %s0 = select i1 %cmp0, float %a0, float %b0
+ %s1 = select i1 %cmp1, float %a1, float %b1
+ %ra = insertelement <2 x float> undef, float %s0, i32 0
+ %rb = insertelement <2 x float> %ra, float %s1, i32 1
+ ret <2 x float> %rb
+}
+
+; Make sure that when we construct partial vectors, we don't keep
+; re-visiting insertelement chains that start with undef
+; (a low cost threshold is needed to force this to happen).
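+; (The input already contains the hand-built <2 x float> select; the CHECK
+; lines confirm it survives intact.)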
+define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
+; CHECK-LABEL: @simple_select_partial_vector(
+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP10]], i32 1
+; CHECK-NEXT: ret <4 x float> [[RB]]
+;
+; ZEROTHRESH-LABEL: @simple_select_partial_vector(
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1
+; ZEROTHRESH-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0
+; ZEROTHRESH-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1
+; ZEROTHRESH-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]]
+; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP9]], i32 0
+; ZEROTHRESH-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP10]], i32 1
+; ZEROTHRESH-NEXT: ret <4 x float> [[RB]]
+;
+ %c0 = extractelement <4 x i32> %c, i32 0
+ %c1 = extractelement <4 x i32> %c, i32 1
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %1 = insertelement <2 x i32> undef, i32 %c0, i32 0
+ %2 = insertelement <2 x i32> %1, i32 %c1, i32 1
+ %3 = icmp ne <2 x i32> %2, zeroinitializer
+ %4 = insertelement <2 x float> undef, float %a0, i32 0
+ %5 = insertelement <2 x float> %4, float %a1, i32 1
+ %6 = insertelement <2 x float> undef, float %b0, i32 0
+ %7 = insertelement <2 x float> %6, float %b1, i32 1
+ %8 = select <2 x i1> %3, <2 x float> %5, <2 x float> %7
+ %9 = extractelement <2 x float> %8, i32 0
+ %ra = insertelement <4 x float> undef, float %9, i32 0
+ %10 = extractelement <2 x float> %8, i32 1
+ %rb = insertelement <4 x float> %ra, float %10, i32 1
+ ret <4 x float> %rb
+}
+
+; Make sure that vectorization happens even if insertelement operations
+; must be rescheduled. The case here is from compiling Julia.
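+; (The extract/fadd/insert triples are interleaved per lane in the input,
+; which is what forces the rescheduling.)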
+define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @reschedule_extract(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x float> [[V3]]
+;
+; ZEROTHRESH-LABEL: @reschedule_extract(
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; ZEROTHRESH-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; ZEROTHRESH-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; ZEROTHRESH-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; ZEROTHRESH-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
+; ZEROTHRESH-NEXT: ret <4 x float> [[V3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %b0 = extractelement <4 x float> %b, i32 0
+ %c0 = fadd float %a0, %b0
+ %v0 = insertelement <4 x float> undef, float %c0, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %b1 = extractelement <4 x float> %b, i32 1
+ %c1 = fadd float %a1, %b1
+ %v1 = insertelement <4 x float> %v0, float %c1, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %b2 = extractelement <4 x float> %b, i32 2
+ %c2 = fadd float %a2, %b2
+ %v2 = insertelement <4 x float> %v1, float %c2, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b3 = extractelement <4 x float> %b, i32 3
+ %c3 = fadd float %a3, %b3
+ %v3 = insertelement <4 x float> %v2, float %c3, i32 3
+ ret <4 x float> %v3
+}
+
+; Check that the cost model for vectorization takes credit for
+; instructions that are erased.
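+; (The four scalar fadds are erased in favor of a single <4 x float> fadd;
+; counting that saving is what makes the tree profitable at zero threshold.)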
+define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @take_credit(
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
+; CHECK-NEXT: ret <4 x float> [[V3]]
+;
+; ZEROTHRESH-LABEL: @take_credit(
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; ZEROTHRESH-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; ZEROTHRESH-NEXT: [[V1:%.*]] = insertelement <4 x float> [[V0]], float [[TMP3]], i32 1
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; ZEROTHRESH-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP4]], i32 2
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; ZEROTHRESH-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP5]], i32 3
+; ZEROTHRESH-NEXT: ret <4 x float> [[V3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %b0 = extractelement <4 x float> %b, i32 0
+ %c0 = fadd float %a0, %b0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %b1 = extractelement <4 x float> %b, i32 1
+ %c1 = fadd float %a1, %b1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %b2 = extractelement <4 x float> %b, i32 2
+ %c2 = fadd float %a2, %b2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %b3 = extractelement <4 x float> %b, i32 3
+ %c3 = fadd float %a3, %b3
+ %v0 = insertelement <4 x float> undef, float %c0, i32 0
+ %v1 = insertelement <4 x float> %v0, float %c1, i32 1
+ %v2 = insertelement <4 x float> %v1, float %c2, i32 2
+ %v3 = insertelement <4 x float> %v2, float %c3, i32 3
+ ret <4 x float> %v3
+}
+
+; Make sure we handle multiple trees that feed one build vector correctly.
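+; (The fadd-by-constant and fmul-by-one chains are distinct trees, and the
+; final build vector is filled in reverse lane order.)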
+define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
+; CHECK-LABEL: @multi_tree(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[X:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
+; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
+; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
+; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
+; CHECK-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
+; CHECK-NEXT: ret <4 x double> [[I4]]
+;
+; ZEROTHRESH-LABEL: @multi_tree(
+; ZEROTHRESH-NEXT: entry:
+; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[X:%.*]], i32 1
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
+; ZEROTHRESH-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
+; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
+; ZEROTHRESH-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double [[TMP7]], i32 2
+; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <4 x double> [[TMP5]], i32 2
+; ZEROTHRESH-NEXT: [[I3:%.*]] = insertelement <4 x double> [[I2]], double [[TMP8]], i32 1
+; ZEROTHRESH-NEXT: [[TMP9:%.*]] = extractelement <4 x double> [[TMP5]], i32 3
+; ZEROTHRESH-NEXT: [[I4:%.*]] = insertelement <4 x double> [[I3]], double [[TMP9]], i32 0
+; ZEROTHRESH-NEXT: ret <4 x double> [[I4]]
+;
+entry:
+ %t0 = fadd double %w , 0.000000e+00
+ %t1 = fadd double %x , 1.000000e+00
+ %t2 = fadd double %y , 2.000000e+00
+ %t3 = fadd double %z , 3.000000e+00
+ %t4 = fmul double %t0, 1.000000e+00
+ %i1 = insertelement <4 x double> undef, double %t4, i32 3
+ %t5 = fmul double %t1, 1.000000e+00
+ %i2 = insertelement <4 x double> %i1, double %t5, i32 2
+ %t6 = fmul double %t2, 1.000000e+00
+ %i3 = insertelement <4 x double> %i2, double %t6, i32 1
+ %t7 = fmul double %t3, 1.000000e+00
+ %i4 = insertelement <4 x double> %i3, double %t7, i32 0
+ ret <4 x double> %i4
+}
+
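+; All eight scalar lanes of this add collapse into a single <8 x float> fadd
+; under both thresholds.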
+define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
+; CHECK-LABEL: @_vadd256(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
+; CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
+; CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
+; CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
+; CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
+; CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
+; CHECK-NEXT: ret <8 x float> [[VECINIT7_I]]
+;
+; ZEROTHRESH-LABEL: @_vadd256(
+; ZEROTHRESH-NEXT: entry:
+; ZEROTHRESH-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
+; ZEROTHRESH-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
+; ZEROTHRESH-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP2]], i32 1
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP0]], i32 2
+; ZEROTHRESH-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP3]], i32 2
+; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP0]], i32 3
+; ZEROTHRESH-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP4]], i32 3
+; ZEROTHRESH-NEXT: [[TMP5:%.*]] = extractelement <8 x float> [[TMP0]], i32 4
+; ZEROTHRESH-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP5]], i32 4
+; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <8 x float> [[TMP0]], i32 5
+; ZEROTHRESH-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP6]], i32 5
+; ZEROTHRESH-NEXT: [[TMP7:%.*]] = extractelement <8 x float> [[TMP0]], i32 6
+; ZEROTHRESH-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP7]], i32 6
+; ZEROTHRESH-NEXT: [[TMP8:%.*]] = extractelement <8 x float> [[TMP0]], i32 7
+; ZEROTHRESH-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP8]], i32 7
+; ZEROTHRESH-NEXT: ret <8 x float> [[VECINIT7_I]]
+;
+entry:
+ %vecext = extractelement <8 x float> %a, i32 0
+ %vecext1 = extractelement <8 x float> %b, i32 0
+ %add = fadd float %vecext, %vecext1
+ %vecext2 = extractelement <8 x float> %a, i32 1
+ %vecext3 = extractelement <8 x float> %b, i32 1
+ %add4 = fadd float %vecext2, %vecext3
+ %vecext5 = extractelement <8 x float> %a, i32 2
+ %vecext6 = extractelement <8 x float> %b, i32 2
+ %add7 = fadd float %vecext5, %vecext6
+ %vecext8 = extractelement <8 x float> %a, i32 3
+ %vecext9 = extractelement <8 x float> %b, i32 3
+ %add10 = fadd float %vecext8, %vecext9
+ %vecext11 = extractelement <8 x float> %a, i32 4
+ %vecext12 = extractelement <8 x float> %b, i32 4
+ %add13 = fadd float %vecext11, %vecext12
+ %vecext14 = extractelement <8 x float> %a, i32 5
+ %vecext15 = extractelement <8 x float> %b, i32 5
+ %add16 = fadd float %vecext14, %vecext15
+ %vecext17 = extractelement <8 x float> %a, i32 6
+ %vecext18 = extractelement <8 x float> %b, i32 6
+ %add19 = fadd float %vecext17, %vecext18
+ %vecext20 = extractelement <8 x float> %a, i32 7
+ %vecext21 = extractelement <8 x float> %b, i32 7
+ %add22 = fadd float %vecext20, %vecext21
+ %vecinit.i = insertelement <8 x float> undef, float %add, i32 0
+ %vecinit1.i = insertelement <8 x float> %vecinit.i, float %add4, i32 1
+ %vecinit2.i = insertelement <8 x float> %vecinit1.i, float %add7, i32 2
+ %vecinit3.i = insertelement <8 x float> %vecinit2.i, float %add10, i32 3
+ %vecinit4.i = insertelement <8 x float> %vecinit3.i, float %add13, i32 4
+ %vecinit5.i = insertelement <8 x float> %vecinit4.i, float %add16, i32 5
+ %vecinit6.i = insertelement <8 x float> %vecinit5.i, float %add19, i32 6
+ %vecinit7.i = insertelement <8 x float> %vecinit6.i, float %add22, i32 7
+ ret <8 x float> %vecinit7.i
+}
+
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll
new file mode 100644
index 00000000000..1af11609fe6
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll
@@ -0,0 +1,307 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
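+; These tests build [N x T] aggregates (and a struct of floats) one element
+; at a time with insertvalue. The expectation, visible in the checks below,
+; is that the SLP vectorizer still forms wide loads and wide arithmetic and
+; only scalarizes at the final extractelement/insertvalue chain.
+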
+define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+; CHECK-LABEL: @julia_2xdouble(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
+; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>*
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
+; CHECK-NEXT: [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
+; CHECK-NEXT: [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1
+; CHECK-NEXT: store [2 x double] [[I1]], [2 x double]* [[TMP0:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+top:
+ %px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
+ %x0 = load double, double* %px0, align 4
+ %py0 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 0
+ %y0 = load double, double* %py0, align 4
+ %m0 = fmul double %x0, %y0
+ %px1 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 1
+ %x1 = load double, double* %px1, align 4
+ %py1 = getelementptr inbounds [2 x double], [2 x double]* %3, i64 0, i64 1
+ %y1 = load double, double* %py1, align 4
+ %m1 = fmul double %x1, %y1
+ %pz0 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 0
+ %z0 = load double, double* %pz0, align 4
+ %a0 = fadd double %m0, %z0
+ %i0 = insertvalue [2 x double] undef, double %a0, 0
+ %pz1 = getelementptr inbounds [2 x double], [2 x double]* %1, i64 0, i64 1
+ %z1 = load double, double* %pz1, align 4
+ %a1 = fadd double %m1, %z1
+ %i1 = insertvalue [2 x double] %i0, double %a1, 1
+ store [2 x double] %i1, [2 x double]* %0, align 4
+ ret void
+}
+
+define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+; CHECK-LABEL: @julia_4xfloat(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 1
+; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 1
+; CHECK-NEXT: [[PX2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 2
+; CHECK-NEXT: [[PY2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 2
+; CHECK-NEXT: [[PX3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 3
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT: [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2
+; CHECK-NEXT: [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>*
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
+; CHECK-NEXT: [[I0:%.*]] = insertvalue [4 x float] undef, float [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
+; CHECK-NEXT: [[I1:%.*]] = insertvalue [4 x float] [[I0]], float [[TMP13]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
+; CHECK-NEXT: [[I2:%.*]] = insertvalue [4 x float] [[I1]], float [[TMP14]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
+; CHECK-NEXT: [[I3:%.*]] = insertvalue [4 x float] [[I2]], float [[TMP15]], 3
+; CHECK-NEXT: store [4 x float] [[I3]], [4 x float]* [[TMP0:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+top:
+ %px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
+ %x0 = load float, float* %px0, align 4
+ %py0 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 0
+ %y0 = load float, float* %py0, align 4
+ %m0 = fmul float %x0, %y0
+ %px1 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 1
+ %x1 = load float, float* %px1, align 4
+ %py1 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 1
+ %y1 = load float, float* %py1, align 4
+ %m1 = fmul float %x1, %y1
+ %px2 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 2
+ %x2 = load float, float* %px2, align 4
+ %py2 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 2
+ %y2 = load float, float* %py2, align 4
+ %m2 = fmul float %x2, %y2
+ %px3 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 3
+ %x3 = load float, float* %px3, align 4
+ %py3 = getelementptr inbounds [4 x float], [4 x float]* %3, i64 0, i64 3
+ %y3 = load float, float* %py3, align 4
+ %m3 = fmul float %x3, %y3
+ %pz0 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 0
+ %z0 = load float, float* %pz0, align 4
+ %a0 = fadd float %m0, %z0
+ %i0 = insertvalue [4 x float] undef, float %a0, 0
+ %pz1 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 1
+ %z1 = load float, float* %pz1, align 4
+ %a1 = fadd float %m1, %z1
+ %i1 = insertvalue [4 x float] %i0, float %a1, 1
+ %pz2 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 2
+ %z2 = load float, float* %pz2, align 4
+ %a2 = fadd float %m2, %z2
+ %i2 = insertvalue [4 x float] %i1, float %a2, 2
+ %pz3 = getelementptr inbounds [4 x float], [4 x float]* %1, i64 0, i64 3
+ %z3 = load float, float* %pz3, align 4
+ %a3 = fadd float %m3, %z3
+ %i3 = insertvalue [4 x float] %i2, float %a3, 3
+ store [4 x float] %i3, [4 x float]* %0, align 4
+ ret void
+}
+
+define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+; CHECK-LABEL: @julia_load_array_of_float(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x float]* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x float], [4 x float]* [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast [4 x float]* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x float], [4 x float]* [[B]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x float] undef, float [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x float] [[C_ARR0]], float [[TMP6]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x float] [[C_ARR1]], float [[TMP7]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x float] [[C_ARR2]], float [[TMP8]], 3
+; CHECK-NEXT: store [4 x float] [[C_ARR3]], [4 x float]* [[C:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+top:
+ %a_arr = load [4 x float], [4 x float]* %a, align 4
+ %a0 = extractvalue [4 x float] %a_arr, 0
+ %a2 = extractvalue [4 x float] %a_arr, 2
+ %a1 = extractvalue [4 x float] %a_arr, 1
+ %b_arr = load [4 x float], [4 x float]* %b, align 4
+ %b0 = extractvalue [4 x float] %b_arr, 0
+ %b2 = extractvalue [4 x float] %b_arr, 2
+ %b1 = extractvalue [4 x float] %b_arr, 1
+ %a3 = extractvalue [4 x float] %a_arr, 3
+ %c1 = fsub float %a1, %b1
+ %b3 = extractvalue [4 x float] %b_arr, 3
+ %c0 = fsub float %a0, %b0
+ %c2 = fsub float %a2, %b2
+ %c_arr0 = insertvalue [4 x float] undef, float %c0, 0
+ %c_arr1 = insertvalue [4 x float] %c_arr0, float %c1, 1
+ %c3 = fsub float %a3, %b3
+ %c_arr2 = insertvalue [4 x float] %c_arr1, float %c2, 2
+ %c_arr3 = insertvalue [4 x float] %c_arr2, float %c3, 3
+ store [4 x float] %c_arr3, [4 x float]* %c, align 4
+ ret void
+}
+
+define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
+; CHECK-LABEL: @julia_load_array_of_i32(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x i32]* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i32], [4 x i32]* [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast [4 x i32]* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i32], [4 x i32]* [[B]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i32] undef, i32 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i32] [[C_ARR0]], i32 [[TMP6]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i32] [[C_ARR1]], i32 [[TMP7]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i32] [[C_ARR2]], i32 [[TMP8]], 3
+; CHECK-NEXT: store [4 x i32] [[C_ARR3]], [4 x i32]* [[C:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+top:
+ %a_arr = load [4 x i32], [4 x i32]* %a, align 4
+ %a0 = extractvalue [4 x i32] %a_arr, 0
+ %a2 = extractvalue [4 x i32] %a_arr, 2
+ %a1 = extractvalue [4 x i32] %a_arr, 1
+ %b_arr = load [4 x i32], [4 x i32]* %b, align 4
+ %b0 = extractvalue [4 x i32] %b_arr, 0
+ %b2 = extractvalue [4 x i32] %b_arr, 2
+ %b1 = extractvalue [4 x i32] %b_arr, 1
+ %a3 = extractvalue [4 x i32] %a_arr, 3
+ %c1 = sub i32 %a1, %b1
+ %b3 = extractvalue [4 x i32] %b_arr, 3
+ %c0 = sub i32 %a0, %b0
+ %c2 = sub i32 %a2, %b2
+ %c_arr0 = insertvalue [4 x i32] undef, i32 %c0, 0
+ %c_arr1 = insertvalue [4 x i32] %c_arr0, i32 %c1, 1
+ %c3 = sub i32 %a3, %b3
+ %c_arr2 = insertvalue [4 x i32] %c_arr1, i32 %c2, 2
+ %c_arr3 = insertvalue [4 x i32] %c_arr2, i32 %c3, 3
+ store [4 x i32] %c_arr3, [4 x i32]* %c, align 4
+ ret void
+}
+
+; Almost identical to the previous test, but for a type that should NOT be vectorized.
+;
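+; (Presumably the cost model for this target rejects the <4 x i16> form; the
+; test only asserts that the scalar code is left untouched.)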
+define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
+; CHECK-LABEL: @julia_load_array_of_i16(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A:%.*]], align 4
+; CHECK-NEXT: [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0
+; CHECK-NEXT: [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2
+; CHECK-NEXT: [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1
+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B:%.*]], align 4
+; CHECK-NEXT: [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0
+; CHECK-NEXT: [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2
+; CHECK-NEXT: [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1
+; CHECK-NEXT: [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3
+; CHECK-NEXT: [[C1:%.*]] = sub i16 [[A1]], [[B1]]
+; CHECK-NEXT: [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3
+; CHECK-NEXT: [[C0:%.*]] = sub i16 [[A0]], [[B0]]
+; CHECK-NEXT: [[C2:%.*]] = sub i16 [[A2]], [[B2]]
+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0
+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1
+; CHECK-NEXT: [[C3:%.*]] = sub i16 [[A3]], [[B3]]
+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2
+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3
+; CHECK-NEXT: store [4 x i16] [[C_ARR3]], [4 x i16]* [[C:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+top:
+ %a_arr = load [4 x i16], [4 x i16]* %a, align 4
+ %a0 = extractvalue [4 x i16] %a_arr, 0
+ %a2 = extractvalue [4 x i16] %a_arr, 2
+ %a1 = extractvalue [4 x i16] %a_arr, 1
+ %b_arr = load [4 x i16], [4 x i16]* %b, align 4
+ %b0 = extractvalue [4 x i16] %b_arr, 0
+ %b2 = extractvalue [4 x i16] %b_arr, 2
+ %b1 = extractvalue [4 x i16] %b_arr, 1
+ %a3 = extractvalue [4 x i16] %a_arr, 3
+ %c1 = sub i16 %a1, %b1
+ %b3 = extractvalue [4 x i16] %b_arr, 3
+ %c0 = sub i16 %a0, %b0
+ %c2 = sub i16 %a2, %b2
+ %c_arr0 = insertvalue [4 x i16] undef, i16 %c0, 0
+ %c_arr1 = insertvalue [4 x i16] %c_arr0, i16 %c1, 1
+ %c3 = sub i16 %a3, %b3
+ %c_arr2 = insertvalue [4 x i16] %c_arr1, i16 %c2, 2
+ %c_arr3 = insertvalue [4 x i16] %c_arr2, i16 %c3, 3
+ store [4 x i16] %c_arr3, [4 x i16]* %c, align 4
+ ret void
+}
+
+%pseudovec = type { float, float, float, float }
+
+define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+; CHECK-LABEL: @julia_load_struct_of_float(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast %pseudovec* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[A_STRUCT:%.*]] = load [[PSEUDOVEC:%.*]], %pseudovec* [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast %pseudovec* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[B_STRUCT:%.*]] = load [[PSEUDOVEC]], %pseudovec* [[B]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[C_STRUCT0:%.*]] = insertvalue [[PSEUDOVEC]] undef, float [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT0]], float [[TMP6]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT1]], float [[TMP7]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT2]], float [[TMP8]], 3
+; CHECK-NEXT: store [[PSEUDOVEC]] [[C_STRUCT3]], %pseudovec* [[C:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+top:
+ %a_struct = load %pseudovec, %pseudovec* %a, align 4
+ %a0 = extractvalue %pseudovec %a_struct, 0
+ %a1 = extractvalue %pseudovec %a_struct, 1
+ %b_struct = load %pseudovec, %pseudovec* %b, align 4
+ %a2 = extractvalue %pseudovec %a_struct, 2
+ %b0 = extractvalue %pseudovec %b_struct, 0
+ %a3 = extractvalue %pseudovec %a_struct, 3
+ %c0 = fsub float %a0, %b0
+ %b1 = extractvalue %pseudovec %b_struct, 1
+ %b2 = extractvalue %pseudovec %b_struct, 2
+ %c1 = fsub float %a1, %b1
+ %c_struct0 = insertvalue %pseudovec undef, float %c0, 0
+ %b3 = extractvalue %pseudovec %b_struct, 3
+ %c3 = fsub float %a3, %b3
+ %c_struct1 = insertvalue %pseudovec %c_struct0, float %c1, 1
+ %c2 = fsub float %a2, %b2
+ %c_struct2 = insertvalue %pseudovec %c_struct1, float %c2, 2
+ %c_struct3 = insertvalue %pseudovec %c_struct2, float %c3, 3
+ store %pseudovec %c_struct3, %pseudovec* %c, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
new file mode 100644
index 00000000000..fbce7551b00
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
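+; -slp-threshold=-999 makes every candidate tree look profitable, so these
+; tests exercise which intrinsic calls may be widened (legality), not
+; whether widening pays off (cost).
+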
+declare double @llvm.fabs.f64(double) nounwind readnone
+
+define void @vec_fabs_f64(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @vec_fabs_f64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %call = tail call double @llvm.fabs.f64(double %mul) nounwind readnone
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ %call5 = tail call double @llvm.fabs.f64(double %mul5) nounwind readnone
+ store double %call, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %call5, double* %arrayidx5, align 8
+ ret void
+}
+
+declare float @llvm.copysign.f32(float, float) nounwind readnone
+
+define void @vec_copysign_f32(float* %a, float* %b, float* noalias %c) {
+; CHECK-LABEL: @vec_copysign_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load float, float* %a, align 4
+ %1 = load float, float* %b, align 4
+ %call0 = tail call float @llvm.copysign.f32(float %0, float %1) nounwind readnone
+ store float %call0, float* %c, align 4
+
+ %ix2 = getelementptr inbounds float, float* %a, i64 1
+ %2 = load float, float* %ix2, align 4
+ %ix3 = getelementptr inbounds float, float* %b, i64 1
+ %3 = load float, float* %ix3, align 4
+ %call1 = tail call float @llvm.copysign.f32(float %2, float %3) nounwind readnone
+ %c1 = getelementptr inbounds float, float* %c, i64 1
+ store float %call1, float* %c1, align 4
+
+ %ix4 = getelementptr inbounds float, float* %a, i64 2
+ %4 = load float, float* %ix4, align 4
+ %ix5 = getelementptr inbounds float, float* %b, i64 2
+ %5 = load float, float* %ix5, align 4
+ %call2 = tail call float @llvm.copysign.f32(float %4, float %5) nounwind readnone
+ %c2 = getelementptr inbounds float, float* %c, i64 2
+ store float %call2, float* %c2, align 4
+
+ %ix6 = getelementptr inbounds float, float* %a, i64 3
+ %6 = load float, float* %ix6, align 4
+ %ix7 = getelementptr inbounds float, float* %b, i64 3
+ %7 = load float, float* %ix7, align 4
+ %call3 = tail call float @llvm.copysign.f32(float %6, float %7) nounwind readnone
+ %c3 = getelementptr inbounds float, float* %c, i64 3
+ store float %call3, float* %c3, align 4
+
+ ret void
+}
+
+declare i32 @llvm.bswap.i32(i32) nounwind readnone
+
+define void @vec_bswap_i32(i32* %a, i32* %b, i32* %c) {
+; CHECK-LABEL: @vec_bswap_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[C:%.*]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load i32, i32* %a, align 4
+ %i1 = load i32, i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.bswap.i32(i32 %add1) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 1
+ %i2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i32 1
+ %i3 = load i32, i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.bswap.i32(i32 %add2) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i32 2
+ %i4 = load i32, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i32 2
+ %i5 = load i32, i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.bswap.i32(i32 %add3) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32, i32* %a, i32 3
+ %i6 = load i32, i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i32 3
+ %i7 = load i32, i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.bswap.i32(i32 %add4) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32, i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+}
+
+declare i32 @llvm.ctlz.i32(i32,i1) nounwind readnone
+
+define void @vec_ctlz_i32(i32* %a, i32* %b, i32* %c, i1) {
+; CHECK-LABEL: @vec_ctlz_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP5]], i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[C:%.*]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load i32, i32* %a, align 4
+ %i1 = load i32, i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 1
+ %i2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i32 1
+ %i3 = load i32, i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 true) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i32 2
+ %i4 = load i32, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i32 2
+ %i5 = load i32, i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32, i32* %a, i32 3
+ %i6 = load i32, i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i32 3
+ %i7 = load i32, i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 true) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32, i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+}
+
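+; Negative test: the i1 is_zero_undef arguments disagree across the four
+; calls (true/false mixed), so they cannot be merged into a single
+; @llvm.ctlz.v4i32 call and the code stays scalar.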
+define void @vec_ctlz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
+; CHECK-LABEL: @vec_ctlz_i32_neg(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4
+; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
+; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD1]], i1 true) #3
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1
+; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
+; CHECK-NEXT: [[CALL2:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD2]], i1 false) #3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2
+; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2
+; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
+; CHECK-NEXT: [[CALL3:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD3]], i1 true) #3
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
+; CHECK-NEXT: [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
+; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
+; CHECK-NEXT: [[CALL4:%.*]] = tail call i32 @llvm.ctlz.i32(i32 [[ADD4]], i1 false) #3
+; CHECK-NEXT: store i32 [[CALL1]], i32* [[C:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 1
+; CHECK-NEXT: store i32 [[CALL2]], i32* [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 2
+; CHECK-NEXT: store i32 [[CALL3]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 3
+; CHECK-NEXT: store i32 [[CALL4]], i32* [[ARRAYIDX10]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load i32, i32* %a, align 4
+ %i1 = load i32, i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.ctlz.i32(i32 %add1,i1 true) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 1
+ %i2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i32 1
+ %i3 = load i32, i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.ctlz.i32(i32 %add2,i1 false) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i32 2
+ %i4 = load i32, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i32 2
+ %i5 = load i32, i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.ctlz.i32(i32 %add3,i1 true) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32, i32* %a, i32 3
+ %i6 = load i32, i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i32 3
+ %i7 = load i32, i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.ctlz.i32(i32 %add4,i1 false) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32, i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+}
+
+declare i32 @llvm.cttz.i32(i32,i1) nounwind readnone
+
+define void @vec_cttz_i32(i32* %a, i32* %b, i32* %c, i1) {
+; CHECK-LABEL: @vec_cttz_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP5]], i1 true)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[C:%.*]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load i32, i32* %a, align 4
+ %i1 = load i32, i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 1
+ %i2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i32 1
+ %i3 = load i32, i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 true) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i32 2
+ %i4 = load i32, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i32 2
+ %i5 = load i32, i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32, i32* %a, i32 3
+ %i6 = load i32, i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i32 3
+ %i7 = load i32, i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 true) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32, i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+}
+
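+; Negative test: as with ctlz above, the mismatched is_zero_undef flags
+; block the merge into @llvm.cttz.v4i32.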
+define void @vec_cttz_i32_neg(i32* %a, i32* %b, i32* %c, i1) {
+; CHECK-LABEL: @vec_cttz_i32_neg(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I0:%.*]] = load i32, i32* [[A:%.*]], align 4
+; CHECK-NEXT: [[I1:%.*]] = load i32, i32* [[B:%.*]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[I0]], [[I1]]
+; CHECK-NEXT: [[CALL1:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD1]], i1 true) #3
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 1
+; CHECK-NEXT: [[I2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 1
+; CHECK-NEXT: [[I3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[I2]], [[I3]]
+; CHECK-NEXT: [[CALL2:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD2]], i1 false) #3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2
+; CHECK-NEXT: [[I4:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2
+; CHECK-NEXT: [[I5:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[I4]], [[I5]]
+; CHECK-NEXT: [[CALL3:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD3]], i1 true) #3
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3
+; CHECK-NEXT: [[I6:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3
+; CHECK-NEXT: [[I7:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[ADD4:%.*]] = add i32 [[I6]], [[I7]]
+; CHECK-NEXT: [[CALL4:%.*]] = tail call i32 @llvm.cttz.i32(i32 [[ADD4]], i1 false) #3
+; CHECK-NEXT: store i32 [[CALL1]], i32* [[C:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 1
+; CHECK-NEXT: store i32 [[CALL2]], i32* [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 2
+; CHECK-NEXT: store i32 [[CALL3]], i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 3
+; CHECK-NEXT: store i32 [[CALL4]], i32* [[ARRAYIDX10]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load i32, i32* %a, align 4
+ %i1 = load i32, i32* %b, align 4
+ %add1 = add i32 %i0, %i1
+ %call1 = tail call i32 @llvm.cttz.i32(i32 %add1,i1 true) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 1
+ %i2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i32 1
+ %i3 = load i32, i32* %arrayidx3, align 4
+ %add2 = add i32 %i2, %i3
+ %call2 = tail call i32 @llvm.cttz.i32(i32 %add2,i1 false) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i32 2
+ %i4 = load i32, i32* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds i32, i32* %b, i32 2
+ %i5 = load i32, i32* %arrayidx5, align 4
+ %add3 = add i32 %i4, %i5
+ %call3 = tail call i32 @llvm.cttz.i32(i32 %add3,i1 true) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds i32, i32* %a, i32 3
+ %i6 = load i32, i32* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds i32, i32* %b, i32 3
+ %i7 = load i32, i32* %arrayidx7, align 4
+ %add4 = add i32 %i6, %i7
+ %call4 = tail call i32 @llvm.cttz.i32(i32 %add4,i1 false) nounwind readnone
+
+ store i32 %call1, i32* %c, align 4
+ %arrayidx8 = getelementptr inbounds i32, i32* %c, i32 1
+ store i32 %call2, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %c, i32 2
+ store i32 %call3, i32* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds i32, i32* %c, i32 3
+ store i32 %call4, i32* %arrayidx10, align 4
+ ret void
+
+}
+
+declare float @llvm.powi.f32(float, i32)
+define void @vec_powi_f32(float* %a, float* %b, float* %c, i32 %P) {
+; CHECK-LABEL: @vec_powi_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.powi.v4f32(<4 x float> [[TMP4]], i32 [[P:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load float, float* %a, align 4
+ %i1 = load float, float* %b, align 4
+ %add1 = fadd float %i0, %i1
+ %call1 = tail call float @llvm.powi.f32(float %add1,i32 %P) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds float, float* %a, i32 1
+ %i2 = load float, float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds float, float* %b, i32 1
+ %i3 = load float, float* %arrayidx3, align 4
+ %add2 = fadd float %i2, %i3
+ %call2 = tail call float @llvm.powi.f32(float %add2,i32 %P) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds float, float* %a, i32 2
+ %i4 = load float, float* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds float, float* %b, i32 2
+ %i5 = load float, float* %arrayidx5, align 4
+ %add3 = fadd float %i4, %i5
+ %call3 = tail call float @llvm.powi.f32(float %add3,i32 %P) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds float, float* %a, i32 3
+ %i6 = load float, float* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds float, float* %b, i32 3
+ %i7 = load float, float* %arrayidx7, align 4
+ %add4 = fadd float %i6, %i7
+ %call4 = tail call float @llvm.powi.f32(float %add4,i32 %P) nounwind readnone
+
+ store float %call1, float* %c, align 4
+ %arrayidx8 = getelementptr inbounds float, float* %c, i32 1
+ store float %call2, float* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds float, float* %c, i32 2
+ store float %call3, float* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds float, float* %c, i32 3
+ store float %call4, float* %arrayidx10, align 4
+ ret void
+
+}
+
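+; Negative test: the exponents differ (%P vs %Q), and @llvm.powi takes the
+; exponent as a single uniform scalar operand, so the four calls cannot be
+; widened into one @llvm.powi.v4f32.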
+define void @vec_powi_f32_neg(float* %a, float* %b, float* %c, i32 %P, i32 %Q) {
+; CHECK-LABEL: @vec_powi_f32_neg(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I0:%.*]] = load float, float* [[A:%.*]], align 4
+; CHECK-NEXT: [[I1:%.*]] = load float, float* [[B:%.*]], align 4
+; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[I0]], [[I1]]
+; CHECK-NEXT: [[CALL1:%.*]] = tail call float @llvm.powi.f32(float [[ADD1]], i32 [[P:%.*]]) #3
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 1
+; CHECK-NEXT: [[I2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i32 1
+; CHECK-NEXT: [[I3:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[I2]], [[I3]]
+; CHECK-NEXT: [[CALL2:%.*]] = tail call float @llvm.powi.f32(float [[ADD2]], i32 [[Q:%.*]]) #3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i32 2
+; CHECK-NEXT: [[I4:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[B]], i32 2
+; CHECK-NEXT: [[I5:%.*]] = load float, float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[I4]], [[I5]]
+; CHECK-NEXT: [[CALL3:%.*]] = tail call float @llvm.powi.f32(float [[ADD3]], i32 [[P]]) #3
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i32 3
+; CHECK-NEXT: [[I6:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i32 3
+; CHECK-NEXT: [[I7:%.*]] = load float, float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[I6]], [[I7]]
+; CHECK-NEXT: [[CALL4:%.*]] = tail call float @llvm.powi.f32(float [[ADD4]], i32 [[Q]]) #3
+; CHECK-NEXT: store float [[CALL1]], float* [[C:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C]], i32 1
+; CHECK-NEXT: store float [[CALL2]], float* [[ARRAYIDX8]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2
+; CHECK-NEXT: store float [[CALL3]], float* [[ARRAYIDX9]], align 4
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3
+; CHECK-NEXT: store float [[CALL4]], float* [[ARRAYIDX10]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load float, float* %a, align 4
+ %i1 = load float, float* %b, align 4
+ %add1 = fadd float %i0, %i1
+ %call1 = tail call float @llvm.powi.f32(float %add1,i32 %P) nounwind readnone
+
+ %arrayidx2 = getelementptr inbounds float, float* %a, i32 1
+ %i2 = load float, float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds float, float* %b, i32 1
+ %i3 = load float, float* %arrayidx3, align 4
+ %add2 = fadd float %i2, %i3
+ %call2 = tail call float @llvm.powi.f32(float %add2,i32 %Q) nounwind readnone
+
+ %arrayidx4 = getelementptr inbounds float, float* %a, i32 2
+ %i4 = load float, float* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds float, float* %b, i32 2
+ %i5 = load float, float* %arrayidx5, align 4
+ %add3 = fadd float %i4, %i5
+ %call3 = tail call float @llvm.powi.f32(float %add3,i32 %P) nounwind readnone
+
+ %arrayidx6 = getelementptr inbounds float, float* %a, i32 3
+ %i6 = load float, float* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds float, float* %b, i32 3
+ %i7 = load float, float* %arrayidx7, align 4
+ %add4 = fadd float %i6, %i7
+ %call4 = tail call float @llvm.powi.f32(float %add4,i32 %Q) nounwind readnone
+
+ store float %call1, float* %c, align 4
+ %arrayidx8 = getelementptr inbounds float, float* %c, i32 1
+ store float %call2, float* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds float, float* %c, i32 2
+ store float %call3, float* %arrayidx9, align 4
+ %arrayidx10 = getelementptr inbounds float, float* %c, i32 3
+ store float %call4, float* %arrayidx10, align 4
+ ret void
+
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
new file mode 100644
index 00000000000..0d314fdb7ba
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mattr=+sse4.2 | FileCheck %s
+
+@a = common local_unnamed_addr global [4 x i32] zeroinitializer, align 4
+@b = common local_unnamed_addr global [4 x i32] zeroinitializer, align 4
+
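+; The scalar code stores to @a in the order {3, 0, 1, 2} while reading @b in
+; order {0, 1, 2, 3}, so the expected output is one load of @b, a reordering
+; shuffle, and a single <4 x i32> store to @a.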
+define i32 @fn1() {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[REORDER_SHUFFLE]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 8, i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP6]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
+; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %0 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4
+ %cmp = icmp sgt i32 %0, 0
+ %cond = select i1 %cmp, i32 8, i32 0
+ store i32 %cond, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i32 3), align 4
+ %1 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4
+ %cmp1 = icmp sgt i32 %1, 0
+ %. = select i1 %cmp1, i32 %1, i32 6
+ store i32 %., i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i32 0), align 4
+ %2 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4
+ %cmp4 = icmp sgt i32 %2, 0
+ %3 = select i1 %cmp4, i32 ptrtoint (i32 ()* @fn1 to i32), i32 0
+ store i32 %3, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i32 1), align 4
+ %4 = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
+ %cmp6 = icmp sgt i32 %4, 0
+ %5 = select i1 %cmp6, i32 ptrtoint (i32 ()* @fn1 to i32), i32 0
+ store i32 %5, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i32 2), align 4
+ ret i32 0
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
new file mode 100644
index 00000000000..7856b7bd6b0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+; void jumble (int * restrict A, int * restrict B) {
+;   int tmp0 = A[10]*A[0];
+;   int tmp1 = A[11]*A[1];
+;   int tmp2 = A[12]*A[3];
+;   int tmp3 = A[13]*A[2];
+;   B[0] = tmp0;
+;   B[1] = tmp1;
+;   B[2] = tmp2;
+;   B[3] = tmp3;
+; }
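+;
+; The A[0..3] operands are consumed in the order {0, 1, 3, 2}, so the
+; expected lowering is a single wide load plus a reordering shuffle, roughly
+; (a sketch, not the verbatim output):
+;
+;   %v = load <4 x i32>, <4 x i32>* %pA
+;   %r = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>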
+
+; Function Attrs: norecurse nounwind uwtable
+define void @jumble1(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+; CHECK-LABEL: @jumble1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP1]], [[REORDER_SHUFFLE]]
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
+ %0 = load i32, i32* %arrayidx, align 4
+ %1 = load i32, i32* %A, align 4
+ %mul = mul nsw i32 %0, %1
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
+ %2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
+ %3 = load i32, i32* %arrayidx3, align 4
+ %mul4 = mul nsw i32 %2, %3
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
+ %4 = load i32, i32* %arrayidx5, align 4
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
+ %5 = load i32, i32* %arrayidx6, align 4
+ %mul7 = mul nsw i32 %4, %5
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
+ %6 = load i32, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
+ %7 = load i32, i32* %arrayidx9, align 4
+ %mul10 = mul nsw i32 %6, %7
+ store i32 %mul, i32* %B, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
+ store i32 %mul4, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
+ store i32 %mul7, i32* %arrayidx13, align 4
+ %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
+ store i32 %mul10, i32* %arrayidx14, align 4
+ ret void
+}
+
+; Same as above, but with the operands of each MUL reversed.
+
+; Function Attrs: norecurse nounwind uwtable
+define void @jumble2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+; CHECK-LABEL: @jumble2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[REORDER_SHUFFLE]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
+ %0 = load i32, i32* %arrayidx, align 4
+ %1 = load i32, i32* %A, align 4
+ %mul = mul nsw i32 %1, %0
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
+ %2 = load i32, i32* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
+ %3 = load i32, i32* %arrayidx3, align 4
+ %mul4 = mul nsw i32 %3, %2
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
+ %4 = load i32, i32* %arrayidx5, align 4
+ %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
+ %5 = load i32, i32* %arrayidx6, align 4
+ %mul7 = mul nsw i32 %5, %4
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
+ %6 = load i32, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
+ %7 = load i32, i32* %arrayidx9, align 4
+ %mul10 = mul nsw i32 %7, %6
+ store i32 %mul, i32* %B, align 4
+ %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
+ store i32 %mul4, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
+ store i32 %mul7, i32* %arrayidx13, align 4
+ %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
+ store i32 %mul10, i32* %arrayidx14, align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
new file mode 100644
index 00000000000..568fd9f3ac7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+; void phiUsingLoads(int *restrict A, int *restrict B) {
+; int tmp0, tmp1, tmp2, tmp3;
+; for (int i = 0; i < 100; i++) {
+; if (A[0] == 0) {
+; tmp0 = A[i + 0];
+; tmp1 = A[i + 1];
+; tmp2 = A[i + 2];
+; tmp3 = A[i + 3];
+; } else if (A[25] == 0) {
+; tmp0 = A[i + 0];
+; tmp1 = A[i + 1];
+; tmp2 = A[i + 2];
+; tmp3 = A[i + 3];
+; } else if (A[50] == 0) {
+; tmp0 = A[i + 0];
+; tmp1 = A[i + 1];
+; tmp2 = A[i + 2];
+; tmp3 = A[i + 3];
+; } else if (A[75] == 0) {
+; tmp0 = A[i + 0];
+; tmp1 = A[i + 1];
+; tmp2 = A[i + 3];
+; tmp3 = A[i + 2];
+; }
+; }
+; B[0] = tmp0;
+; B[1] = tmp1;
+; B[2] = tmp2;
+; B[3] = tmp3;
+;}
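+;
+; Only the A[75] branch consumes its lanes out of order (tmp2 = A[i + 3],
+; tmp3 = A[i + 2]), so the wide load in that branch needs a reordering
+; shuffle before it can feed the common phi, roughly (a sketch):
+;
+;   %v = load <4 x i32>, <4 x i32>* %p
+;   %r = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>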
+
+
+; Function Attrs: norecurse nounwind uwtable
+define void @phiUsingLoads(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) local_unnamed_addr #0 {
+; CHECK-LABEL: @phiUsingLoads(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 25
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 50
+; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 75
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX66:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP27:%.*]], <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP27]], [[FOR_INC]] ]
+; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: if.else:
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[CMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT: br i1 [[CMP13]], label [[IF_THEN14:%.*]], label [[IF_ELSE27:%.*]]
+; CHECK: if.then14:
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX17]] to <4 x i32>*
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: if.else27:
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX28]], align 4
+; CHECK-NEXT: [[CMP29:%.*]] = icmp eq i32 [[TMP14]], 0
+; CHECK-NEXT: br i1 [[CMP29]], label [[IF_THEN30:%.*]], label [[IF_ELSE43:%.*]]
+; CHECK: if.then30:
+; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX33]] to <4 x i32>*
+; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: if.else43:
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4
+; CHECK-NEXT: [[CMP45:%.*]] = icmp eq i32 [[TMP20]], 0
+; CHECK-NEXT: br i1 [[CMP45]], label [[IF_THEN46:%.*]], label [[FOR_INC]]
+; CHECK: if.then46:
+; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX49]] to <4 x i32>*
+; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[TMP27]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP26]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+entry:
+ %0 = load i32, i32* %A, align 4
+ %cmp1 = icmp eq i32 %0, 0
+ %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 25
+ %arrayidx28 = getelementptr inbounds i32, i32* %A, i64 50
+ %arrayidx44 = getelementptr inbounds i32, i32* %A, i64 75
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.inc
+ store i32 %tmp0.1, i32* %B, align 4
+ %arrayidx64 = getelementptr inbounds i32, i32* %B, i64 1
+ store i32 %tmp1.1, i32* %arrayidx64, align 4
+ %arrayidx65 = getelementptr inbounds i32, i32* %B, i64 2
+ store i32 %tmp2.1, i32* %arrayidx65, align 4
+ %arrayidx66 = getelementptr inbounds i32, i32* %B, i64 3
+ store i32 %tmp3.1, i32* %arrayidx66, align 4
+ ret void
+
+for.body: ; preds = %for.inc, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+ %tmp3.0111 = phi i32 [ undef, %entry ], [ %tmp3.1, %for.inc ]
+ %tmp2.0110 = phi i32 [ undef, %entry ], [ %tmp2.1, %for.inc ]
+ %tmp1.0109 = phi i32 [ undef, %entry ], [ %tmp1.1, %for.inc ]
+ %tmp0.0108 = phi i32 [ undef, %entry ], [ %tmp0.1, %for.inc ]
+ br i1 %cmp1, label %if.then, label %if.else
+
+if.then: ; preds = %for.body
+ %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %2 = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %2
+ %3 = load i32, i32* %arrayidx5, align 4
+ %4 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %4
+ %5 = load i32, i32* %arrayidx8, align 4
+ %6 = add nuw nsw i64 %indvars.iv, 3
+ %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %6
+ %7 = load i32, i32* %arrayidx11, align 4
+ br label %for.inc
+
+if.else: ; preds = %for.body
+ %8 = load i32, i32* %arrayidx12, align 4
+ %cmp13 = icmp eq i32 %8, 0
+ br i1 %cmp13, label %if.then14, label %if.else27
+
+if.then14: ; preds = %if.else
+ %arrayidx17 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %9 = load i32, i32* %arrayidx17, align 4
+ %10 = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx20 = getelementptr inbounds i32, i32* %A, i64 %10
+ %11 = load i32, i32* %arrayidx20, align 4
+ %12 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx23 = getelementptr inbounds i32, i32* %A, i64 %12
+ %13 = load i32, i32* %arrayidx23, align 4
+ %14 = add nuw nsw i64 %indvars.iv, 3
+ %arrayidx26 = getelementptr inbounds i32, i32* %A, i64 %14
+ %15 = load i32, i32* %arrayidx26, align 4
+ br label %for.inc
+
+if.else27: ; preds = %if.else
+ %16 = load i32, i32* %arrayidx28, align 4
+ %cmp29 = icmp eq i32 %16, 0
+ br i1 %cmp29, label %if.then30, label %if.else43
+
+if.then30: ; preds = %if.else27
+ %arrayidx33 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %17 = load i32, i32* %arrayidx33, align 4
+ %18 = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx36 = getelementptr inbounds i32, i32* %A, i64 %18
+ %19 = load i32, i32* %arrayidx36, align 4
+ %20 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx39 = getelementptr inbounds i32, i32* %A, i64 %20
+ %21 = load i32, i32* %arrayidx39, align 4
+ %22 = add nuw nsw i64 %indvars.iv, 3
+ %arrayidx42 = getelementptr inbounds i32, i32* %A, i64 %22
+ %23 = load i32, i32* %arrayidx42, align 4
+ br label %for.inc
+
+if.else43: ; preds = %if.else27
+ %24 = load i32, i32* %arrayidx44, align 4
+ %cmp45 = icmp eq i32 %24, 0
+ br i1 %cmp45, label %if.then46, label %for.inc
+
+if.then46: ; preds = %if.else43
+ %arrayidx49 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %25 = load i32, i32* %arrayidx49, align 4
+ %26 = add nuw nsw i64 %indvars.iv, 1
+ %arrayidx52 = getelementptr inbounds i32, i32* %A, i64 %26
+ %27 = load i32, i32* %arrayidx52, align 4
+ %28 = add nuw nsw i64 %indvars.iv, 3
+ %arrayidx55 = getelementptr inbounds i32, i32* %A, i64 %28
+ %29 = load i32, i32* %arrayidx55, align 4
+ %30 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx58 = getelementptr inbounds i32, i32* %A, i64 %30
+ %31 = load i32, i32* %arrayidx58, align 4
+ br label %for.inc
+
+for.inc: ; preds = %if.then, %if.then30, %if.else43, %if.then46, %if.then14
+ %tmp0.1 = phi i32 [ %1, %if.then ], [ %9, %if.then14 ], [ %17, %if.then30 ], [ %25, %if.then46 ], [ %tmp0.0108, %if.else43 ]
+ %tmp1.1 = phi i32 [ %3, %if.then ], [ %11, %if.then14 ], [ %19, %if.then30 ], [ %27, %if.then46 ], [ %tmp1.0109, %if.else43 ]
+ %tmp2.1 = phi i32 [ %5, %if.then ], [ %13, %if.then14 ], [ %21, %if.then30 ], [ %29, %if.then46 ], [ %tmp2.0110, %if.else43 ]
+ %tmp3.1 = phi i32 [ %7, %if.then ], [ %15, %if.then14 ], [ %23, %if.then30 ], [ %31, %if.then46 ], [ %tmp3.0111, %if.else43 ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 100
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
new file mode 100644
index 00000000000..4887fe6c5f9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+
+
+define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
+; CHECK-LABEL: @jumbled-load(
+; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
+; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+ %in.addr = getelementptr inbounds i32, i32* %in, i64 0
+ %load.1 = load i32, i32* %in.addr, align 4
+ %gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3
+ %load.2 = load i32, i32* %gep.1, align 4
+ %gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1
+ %load.3 = load i32, i32* %gep.2, align 4
+ %gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2
+ %load.4 = load i32, i32* %gep.3, align 4
+ %inn.addr = getelementptr inbounds i32, i32* %inn, i64 0
+ %load.5 = load i32, i32* %inn.addr, align 4
+ %gep.4 = getelementptr inbounds i32, i32* %inn.addr, i64 2
+ %load.6 = load i32, i32* %gep.4, align 4
+ %gep.5 = getelementptr inbounds i32, i32* %inn.addr, i64 3
+ %load.7 = load i32, i32* %gep.5, align 4
+ %gep.6 = getelementptr inbounds i32, i32* %inn.addr, i64 1
+ %load.8 = load i32, i32* %gep.6, align 4
+ %mul.1 = mul i32 %load.3, %load.5
+ %mul.2 = mul i32 %load.2, %load.8
+ %mul.3 = mul i32 %load.4, %load.7
+ %mul.4 = mul i32 %load.1, %load.6
+ %gep.7 = getelementptr inbounds i32, i32* %out, i64 0
+ store i32 %mul.1, i32* %gep.7, align 4
+ %gep.8 = getelementptr inbounds i32, i32* %out, i64 1
+ store i32 %mul.2, i32* %gep.8, align 4
+ %gep.9 = getelementptr inbounds i32, i32* %out, i64 2
+ store i32 %mul.3, i32* %gep.9, align 4
+ %gep.10 = getelementptr inbounds i32, i32* %out, i64 3
+ store i32 %mul.4, i32* %gep.10, align 4
+
+ ret i32 undef
+}
+
+
+define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias nocapture %out) {
+; CHECK-LABEL: @jumbled-load-multiuses(
+; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3
+; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[TMP10]]
+; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+ %in.addr = getelementptr inbounds i32, i32* %in, i64 0
+ %load.1 = load i32, i32* %in.addr, align 4
+ %gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 3
+ %load.2 = load i32, i32* %gep.1, align 4
+ %gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 1
+ %load.3 = load i32, i32* %gep.2, align 4
+ %gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 2
+ %load.4 = load i32, i32* %gep.3, align 4
+ %mul.1 = mul i32 %load.3, %load.4
+ %mul.2 = mul i32 %load.2, %load.2
+ %mul.3 = mul i32 %load.4, %load.1
+ %mul.4 = mul i32 %load.1, %load.3
+ %gep.7 = getelementptr inbounds i32, i32* %out, i64 0
+ store i32 %mul.1, i32* %gep.7, align 4
+ %gep.8 = getelementptr inbounds i32, i32* %out, i64 1
+ store i32 %mul.2, i32* %gep.8, align 4
+ %gep.9 = getelementptr inbounds i32, i32* %out, i64 2
+ store i32 %mul.3, i32* %gep.9, align 4
+ %gep.10 = getelementptr inbounds i32, i32* %out, i64 3
+ store i32 %mul.4, i32* %gep.10, align 4
+
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/limit.ll b/llvm/test/Transforms/SLPVectorizer/X86/limit.ll
new file mode 100644
index 00000000000..41db490a754
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/limit.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common global [4 x i32] zeroinitializer, align 16
+@c = common global [4 x i32] zeroinitializer, align 16
+@d = common global [4 x i32] zeroinitializer, align 16
+@e = common global [4 x i32] zeroinitializer, align 16
+@a = common global [4 x i32] zeroinitializer, align 16
+@fb = common global [4 x float] zeroinitializer, align 16
+@fc = common global [4 x float] zeroinitializer, align 16
+@fa = common global [4 x float] zeroinitializer, align 16
+@fd = common global [4 x float] zeroinitializer, align 16
+
+define void @addsub() {
+; CHECK-LABEL: @addsub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @c to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @d to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @e to <4 x i32>*), align 16
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 0), align 4
+ %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 0), align 4
+ %add = add nsw i32 %0, %1
+ br label %bb1
+bb1: ; preds = %entry
+ %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 0), align 4
+ %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 0), align 4
+ %add1 = add nsw i32 %2, %3
+ %add2 = add nsw i32 %add, %add1
+ store i32 %add2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 0), align 4
+ %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 1), align 4
+ %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %4, %5
+ %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 1), align 4
+ %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 1), align 4
+ %add4 = add nsw i32 %6, %7
+ %sub = sub nsw i32 %add3, %add4
+ store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 1), align 4
+ %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 2), align 4
+ %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 2), align 4
+ %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %add7 = add nsw i32 %add5, %add6
+ store i32 %add7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 2), align 4
+ %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 3), align 4
+ %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 3), align 4
+ %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %sub10 = sub nsw i32 %add8, %add9
+ store i32 %sub10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 3), align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/X86/lit.local.cfg
new file mode 100644
index 00000000000..e71f3cc4c41
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lit.local.cfg
@@ -0,0 +1,3 @@
+if 'X86' not in config.root.targets:
+ config.unsupported = True
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
new file mode 100644
index 00000000000..df990be073b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s
+
+;unsigned load_le32(unsigned char *data) {
+; unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24);
+; return le32;
+;}
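+; On a little-endian target the four byte loads above assemble the i32 at
+; data[0..3]; the CHECK lines below show the scalar zext/shl/or chain is
+; left in place rather than being merged into a single wide load.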
+
+define i32 @_Z9load_le32Ph(i8* nocapture readonly %data) {
+; CHECK-LABEL: @_Z9load_le32Ph(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[DATA:%.*]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[SHL3:%.*]] = shl nuw nsw i32 [[CONV2]], 8
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL3]], [[CONV]]
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT: [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 16
+; CHECK-NEXT: [[OR7:%.*]] = or i32 [[OR]], [[SHL6]]
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[DATA]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX8]], align 1
+; CHECK-NEXT: [[CONV9:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT: [[SHL10:%.*]] = shl nuw i32 [[CONV9]], 24
+; CHECK-NEXT: [[OR11:%.*]] = or i32 [[OR7]], [[SHL10]]
+; CHECK-NEXT: ret i32 [[OR11]]
+;
+entry:
+ %0 = load i8, i8* %data, align 1
+ %conv = zext i8 %0 to i32
+ %arrayidx1 = getelementptr inbounds i8, i8* %data, i64 1
+ %1 = load i8, i8* %arrayidx1, align 1
+ %conv2 = zext i8 %1 to i32
+ %shl3 = shl nuw nsw i32 %conv2, 8
+ %or = or i32 %shl3, %conv
+ %arrayidx4 = getelementptr inbounds i8, i8* %data, i64 2
+ %2 = load i8, i8* %arrayidx4, align 1
+ %conv5 = zext i8 %2 to i32
+ %shl6 = shl nuw nsw i32 %conv5, 16
+ %or7 = or i32 %or, %shl6
+ %arrayidx8 = getelementptr inbounds i8, i8* %data, i64 3
+ %3 = load i8, i8* %arrayidx8, align 1
+ %conv9 = zext i8 %3 to i32
+ %shl10 = shl nuw i32 %conv9, 24
+ %or11 = or i32 %or7, %shl10
+ ret i32 %or11
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll
new file mode 100644
index 00000000000..ffbdd9f1d14
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/long_chains.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; At this point we can't vectorize only parts of the tree.
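+; The two lanes below each run a long fmul/fadd chain, so the choice is
+; all-or-nothing: either the whole chain becomes <2 x double> operations,
+; as the CHECK lines show, or nothing is vectorized.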
+
+define i32 @test(double* nocapture %A, i8* nocapture %B) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <2 x i8>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], <i8 3, i8 3>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8> undef, i8 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> [[TMP4]], i8 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP6]] to <2 x double>
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP13]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[TMP14]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[TMP16]], <double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP17]], <2 x double>* [[TMP18]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load i8, i8* %B, align 1
+ %arrayidx1 = getelementptr inbounds i8, i8* %B, i64 1
+ %1 = load i8, i8* %arrayidx1, align 1
+ %add = add i8 %0, 3
+ %add4 = add i8 %1, 3
+ %conv6 = sitofp i8 %add to double
+ %conv7 = sitofp i8 %add4 to double
+ %mul = fmul double %conv6, %conv6
+ %add8 = fadd double %mul, 1.000000e+00
+ %mul9 = fmul double %conv7, %conv7
+ %add10 = fadd double %mul9, 1.000000e+00
+ %mul11 = fmul double %add8, %add8
+ %add12 = fadd double %mul11, 1.000000e+00
+ %mul13 = fmul double %add10, %add10
+ %add14 = fadd double %mul13, 1.000000e+00
+ %mul15 = fmul double %add12, %add12
+ %add16 = fadd double %mul15, 1.000000e+00
+ %mul17 = fmul double %add14, %add14
+ %add18 = fadd double %mul17, 1.000000e+00
+ %mul19 = fmul double %add16, %add16
+ %add20 = fadd double %mul19, 1.000000e+00
+ %mul21 = fmul double %add18, %add18
+ %add22 = fadd double %mul21, 1.000000e+00
+ %mul23 = fmul double %add20, %add20
+ %add24 = fadd double %mul23, 1.000000e+00
+ %mul25 = fmul double %add22, %add22
+ %add26 = fadd double %mul25, 1.000000e+00
+ store double %add24, double* %A, align 8
+ %arrayidx28 = getelementptr inbounds double, double* %A, i64 1
+ store double %add26, double* %arrayidx28, align 8
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/loopinvariant.ll b/llvm/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
new file mode 100644
index 00000000000..020b50d5463
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @foo(i32* nocapture %A, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP62:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP62]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDVARS_IV]], 4
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDVARS_IV]], 5
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDVARS_IV]], 6
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDVARS_IV]], 7
+; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>*
+; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> undef, i32 [[N]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[N]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[N]], i32 2
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[N]], i32 3
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[N]], i32 4
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[N]], i32 5
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[N]], i32 6
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[N]], i32 7
+; CHECK-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[TMP8]], [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 8
+; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %cmp62 = icmp sgt i32 %n, 0
+ br i1 %cmp62, label %for.body, label %for.end
+
+for.body:
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %add1 = add nsw i32 %0, %n
+ store i32 %add1, i32* %arrayidx, align 4
+ %1 = or i64 %indvars.iv, 1
+ %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %1
+ %2 = load i32, i32* %arrayidx4, align 4
+ %add5 = add nsw i32 %2, %n
+ store i32 %add5, i32* %arrayidx4, align 4
+ %3 = or i64 %indvars.iv, 2
+ %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %3
+ %4 = load i32, i32* %arrayidx8, align 4
+ %add9 = add nsw i32 %4, %n
+ store i32 %add9, i32* %arrayidx8, align 4
+ %5 = or i64 %indvars.iv, 3
+ %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 %5
+ %6 = load i32, i32* %arrayidx12, align 4
+ %add13 = add nsw i32 %6, %n
+ store i32 %add13, i32* %arrayidx12, align 4
+ %7 = or i64 %indvars.iv, 4
+ %arrayidx16 = getelementptr inbounds i32, i32* %A, i64 %7
+ %8 = load i32, i32* %arrayidx16, align 4
+ %add17 = add nsw i32 %8, %n
+ store i32 %add17, i32* %arrayidx16, align 4
+ %9 = or i64 %indvars.iv, 5
+ %arrayidx20 = getelementptr inbounds i32, i32* %A, i64 %9
+ %10 = load i32, i32* %arrayidx20, align 4
+ %add21 = add nsw i32 %10, %n
+ store i32 %add21, i32* %arrayidx20, align 4
+ %11 = or i64 %indvars.iv, 6
+ %arrayidx24 = getelementptr inbounds i32, i32* %A, i64 %11
+ %12 = load i32, i32* %arrayidx24, align 4
+ %add25 = add nsw i32 %12, %n
+ store i32 %add25, i32* %arrayidx24, align 4
+ %13 = or i64 %indvars.iv, 7
+ %arrayidx28 = getelementptr inbounds i32, i32* %A, i64 %13
+ %14 = load i32, i32* %arrayidx28, align 4
+ %add29 = add nsw i32 %14, %n
+ store i32 %add29, i32* %arrayidx28, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 8
+ %15 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %15, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+ ret i32 undef
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll
new file mode 100644
index 00000000000..fdfd032f3ce
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @test1(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8, !tbaa !0
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !tbaa !0
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]], !fpmath !4
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8, !tbaa !0
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load double, double* %a, align 8, !tbaa !4
+ %i1 = load double, double* %b, align 8, !tbaa !4
+ %mul = fmul double %i0, %i1, !fpmath !0
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8, !tbaa !4
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8, !tbaa !4
+ %mul5 = fmul double %i3, %i4, !fpmath !0
+ store double %mul, double* %c, align 8, !tbaa !4
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8, !tbaa !4
+ ret void
+}
+
+define void @test2(double* %a, double* %b, i8* %e) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8, !tbaa !0
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !tbaa !0
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]], !fpmath !5
+; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double*
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8, !tbaa !0
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load double, double* %a, align 8, !tbaa !4
+ %i1 = load double, double* %b, align 8, !tbaa !4
+ %mul = fmul double %i0, %i1, !fpmath !1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8, !tbaa !4
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8, !tbaa !4
+ %mul5 = fmul double %i3, %i4, !fpmath !1
+ %c = bitcast i8* %e to double*
+ store double %mul, double* %c, align 8, !tbaa !4
+ %carrayidx5 = getelementptr inbounds i8, i8* %e, i64 8
+ %arrayidx5 = bitcast i8* %carrayidx5 to double*
+ store double %mul5, double* %arrayidx5, align 8, !tbaa !4
+ ret void
+}
+
+;CHECK-DAG: !0 = !{[[TYPEC:!.*]], [[TYPEC]], i64 0}
+;CHECK-DAG: !4 = !{float 5.000000e+00}
+;CHECK-DAG: !5 = !{float 2.500000e+00}
+!0 = !{ float 5.0 }
+!1 = !{ float 2.5 }
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"omnipotent char", !2}
+!4 = !{!"double", !3}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
new file mode 100644
index 00000000000..4241cb922ea
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/minimum-sizes.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-threshold=-6 -slp-vectorizer -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; These tests ensure that we do not regress due to PR31243. Note that we set
+; the SLP threshold to force vectorization even when not profitable.
+
+; When computing minimum sizes, if we can prove the sign bit is zero, we can
+; zero-extend the roots back to their original sizes.
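+; For example, %tmp2 = or i32 (zext i8 %v0 to i32), 1 has its upper 24 bits
+; known zero, so the i32 root can be narrowed to i8 and rebuilt with zext
+; without changing its value (for %v0 = 255, or 255, 1 yields 255 at either
+; width).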
+;
+define i8 @PR31243_zext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
+; CHECK-LABEL: @PR31243_zext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i64
+; CHECK-NEXT: [[TMPE4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMPE4]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i8 [[TMP8]]
+;
+entry:
+ %tmp0 = zext i8 %v0 to i32
+ %tmp1 = zext i8 %v1 to i32
+ %tmp2 = or i32 %tmp0, 1
+ %tmp3 = or i32 %tmp1, 1
+ %tmp4 = getelementptr inbounds i8, i8* %ptr, i32 %tmp2
+ %tmp5 = getelementptr inbounds i8, i8* %ptr, i32 %tmp3
+ %tmp6 = load i8, i8* %tmp4
+ %tmp7 = load i8, i8* %tmp5
+ %tmp8 = add i8 %tmp6, %tmp7
+ ret i8 %tmp8
+}
+
+; When computing minimum sizes, if we cannot prove the sign bit is zero, we
+; have to include one extra bit for signedness since we will sign-extend the
+; roots.
+;
+; FIXME: This test is suboptimal since the computation can be performed in i8.
+; In general, we need to add an extra bit to the maximum bit width only
+; if we can't prove that the upper bit of the original type is equal to
+; the upper bit of the proposed smaller type. If these two bits are the
+; same (either zero or one) we know that sign-extending from the smaller
+; type will result in the same value. Since we don't yet perform this
+; optimization, we make the proposed smaller type (i8) larger (i16) to
+; ensure correctness.
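+; Concretely, for %v0 = -1 the scalar code computes or i32 (sext i8 -1), 1,
+; which is -1; bits 8..31 of the sext are copies of bit 7, and the or only
+; sets bit 0, so truncating to i16 and sign-extending the i16 result
+; reproduces the i32 value. That is why i16 is safe here even though i8
+; would already suffice.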
+;
+define i8 @PR31243_sext(i8 %v0, i8 %v1, i8 %v2, i8 %v3, i8* %ptr) {
+; CHECK-LABEL: @PR31243_sext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i8> undef, i8 [[V0:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i8> [[TMP0]], i8 [[V1:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = sext i16 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = sext i16 [[TMP6]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: ret i8 [[TMP8]]
+;
+entry:
+ %tmp0 = sext i8 %v0 to i32
+ %tmp1 = sext i8 %v1 to i32
+ %tmp2 = or i32 %tmp0, 1
+ %tmp3 = or i32 %tmp1, 1
+ %tmp4 = getelementptr inbounds i8, i8* %ptr, i32 %tmp2
+ %tmp5 = getelementptr inbounds i8, i8* %ptr, i32 %tmp3
+ %tmp6 = load i8, i8* %tmp4
+ %tmp7 = load i8, i8* %tmp5
+ %tmp8 = add i8 %tmp6, %tmp7
+ ret i8 %tmp8
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
new file mode 100644
index 00000000000..f785926cc41
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; int bar(double *A, int d) {
+; double A0 = A[0];
+; double A1 = A[1];
+; float F0 = A0;
+; float F1 = A1;
+; if (d) foo(); <----- This splits the blocks
+; F0+=4.0;
+; F1+=5.0;
+; A[8] = 9.0 + F0;
+; A[9] = 5.0 + F1;
+; }
+
+
+define i32 @bar(double* nocapture %A, i32 %d) {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[D:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[TMP7:%.*]], label [[TMP5:%.*]]
+; CHECK: [[TMP6:%.*]] = tail call i32 (...) @foo()
+; CHECK-NEXT: br label [[TMP7]]
+; CHECK: [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], <float 4.000000e+00, float 5.000000e+00>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8
+; CHECK-NEXT: [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 9.000000e+00, double 5.000000e+00>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP9]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+ %1 = load double, double* %A, align 8
+ %2 = getelementptr inbounds double, double* %A, i64 1
+ %3 = load double, double* %2, align 8
+ %4 = fptrunc double %1 to float
+ %5 = fptrunc double %3 to float
+ %6 = icmp eq i32 %d, 0
+ br i1 %6, label %9, label %7
+
+; <label>:7 ; preds = %0
+ %8 = tail call i32 (...) @foo()
+ br label %9
+
+; <label>:9 ; preds = %0, %7
+ %10 = fadd float %4, 4.000000e+00
+ %11 = fadd float %5, 5.000000e+00
+ %12 = fpext float %10 to double
+ %13 = fadd double %12, 9.000000e+00
+ %14 = getelementptr inbounds double, double* %A, i64 8
+ store double %13, double* %14, align 8
+ %15 = fpext float %11 to double
+ %16 = fadd double %15, 5.000000e+00
+ %17 = getelementptr inbounds double, double* %A, i64 9
+ store double %16, double* %17, align 8
+ ret i32 undef
+}
+
+declare i32 @foo(...)
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll
new file mode 100644
index 00000000000..9268adf9481
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+;int foo (int *A, int n) {
+; A[0] += n * 5 + 7;
+; A[1] += n * 5 + 8;
+; A[2] += n * 5 + 9;
+; A[3] += n * 5 + 10;
+; A[4] += n * 5 + 11;
+;}
+
+define i32 @foo(i32* nocapture %A, i32 %n) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[N:%.*]], 5
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 7, i32 8, i32 9, i32 10>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP1]], 11
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = add nsw i32 [[TMP11]], [[TMP13]]
+; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP12]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+ %1 = mul nsw i32 %n, 5
+ %2 = add nsw i32 %1, 7
+ %3 = load i32, i32* %A, align 4
+ %4 = add nsw i32 %2, %3
+ store i32 %4, i32* %A, align 4
+ %5 = add nsw i32 %1, 8
+ %6 = getelementptr inbounds i32, i32* %A, i64 1
+ %7 = load i32, i32* %6, align 4
+ %8 = add nsw i32 %5, %7
+ store i32 %8, i32* %6, align 4
+ %9 = add nsw i32 %1, 9
+ %10 = getelementptr inbounds i32, i32* %A, i64 2
+ %11 = load i32, i32* %10, align 4
+ %12 = add nsw i32 %9, %11
+ store i32 %12, i32* %10, align 4
+ %13 = add nsw i32 %1, 10
+ %14 = getelementptr inbounds i32, i32* %A, i64 3
+ %15 = load i32, i32* %14, align 4
+ %16 = add nsw i32 %13, %15
+ store i32 %16, i32* %14, align 4
+ %17 = add nsw i32 %1, 11
+ %18 = getelementptr inbounds i32, i32* %A, i64 4
+ %19 = load i32, i32* %18, align 4
+ %20 = add nsw i32 %17, %19
+ store i32 %20, i32* %18, align 4
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
new file mode 100644
index 00000000000..b7321aaf458
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/odd_store.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+;int foo(char * restrict A, float * restrict B, float T) {
+; A[0] = (T * B[10] + 4.0);
+; A[1] = (T * B[11] + 5.0);
+; A[2] = (T * B[12] + 6.0);
+;}
+
+define i32 @foo(i8* noalias nocapture %A, float* noalias nocapture %B, float %T) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 10
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[T:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[TMP3]] to double
+; CHECK-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 4.000000e+00
+; CHECK-NEXT: [[TMP6:%.*]] = fptosi double [[TMP5]] to i8
+; CHECK-NEXT: store i8 [[TMP6]], i8* [[A:%.*]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[B]], i64 11
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = fmul float [[TMP8]], [[T]]
+; CHECK-NEXT: [[TMP10:%.*]] = fpext float [[TMP9]] to double
+; CHECK-NEXT: [[TMP11:%.*]] = fadd double [[TMP10]], 5.000000e+00
+; CHECK-NEXT: [[TMP12:%.*]] = fptosi double [[TMP11]] to i8
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 1
+; CHECK-NEXT: store i8 [[TMP12]], i8* [[TMP13]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 12
+; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP15]], [[T]]
+; CHECK-NEXT: [[TMP17:%.*]] = fpext float [[TMP16]] to double
+; CHECK-NEXT: [[TMP18:%.*]] = fadd double [[TMP17]], 6.000000e+00
+; CHECK-NEXT: [[TMP19:%.*]] = fptosi double [[TMP18]] to i8
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 2
+; CHECK-NEXT: store i8 [[TMP19]], i8* [[TMP20]], align 1
+; CHECK-NEXT: ret i32 undef
+;
+ %1 = getelementptr inbounds float, float* %B, i64 10
+ %2 = load float, float* %1, align 4
+ %3 = fmul float %2, %T
+ %4 = fpext float %3 to double
+ %5 = fadd double %4, 4.000000e+00
+ %6 = fptosi double %5 to i8
+ store i8 %6, i8* %A, align 1
+ %7 = getelementptr inbounds float, float* %B, i64 11
+ %8 = load float, float* %7, align 4
+ %9 = fmul float %8, %T
+ %10 = fpext float %9 to double
+ %11 = fadd double %10, 5.000000e+00
+ %12 = fptosi double %11 to i8
+ %13 = getelementptr inbounds i8, i8* %A, i64 1
+ store i8 %12, i8* %13, align 1
+ %14 = getelementptr inbounds float, float* %B, i64 12
+ %15 = load float, float* %14, align 4
+ %16 = fmul float %15, %T
+ %17 = fpext float %16 to double
+ %18 = fadd double %17, 6.000000e+00
+ %19 = fptosi double %18 to i8
+ %20 = getelementptr inbounds i8, i8* %A, i64 2
+ store i8 %19, i8* %20, align 1
+ ret i32 undef
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
new file mode 100644
index 00000000000..726f04aafb1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -instcombine -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+
+; Make sure we order the operands of commutative operations so that we get
+; bigger vectorizable trees.
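+; In @shuffle_operands1 below, the scalar code computes fadd %v0_1, %v1 and
+; fadd %v2, %v0_2; commuting the second fadd so both loaded values sit in
+; the same operand position lets a single <2 x double> load feed the vector
+; fadd, as the CHECK lines show.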
+
+define void @shuffle_operands1(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @shuffle_operands1(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[V1:%.*]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+ %from_1 = getelementptr double, double *%from, i64 1
+ %v0_1 = load double , double * %from
+ %v0_2 = load double , double * %from_1
+ %v1_1 = fadd double %v0_1, %v1
+ %v1_2 = fadd double %v2, %v0_2
+ %to_2 = getelementptr double, double * %to, i64 1
+ store double %v1_1, double *%to
+ store double %v1_2, double *%to_2
+ ret void
+}
+
+define void @vecload_vs_broadcast(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @vecload_vs_broadcast(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LP:%.*]]
+; CHECK: lp:
+; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK: ext:
+; CHECK-NEXT: ret void
+;
+entry:
+br label %lp
+
+lp:
+ %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+ %from_1 = getelementptr double, double *%from, i64 1
+ %v0_1 = load double , double * %from
+ %v0_2 = load double , double * %from_1
+ %v1_1 = fadd double %v0_1, %p
+ %v1_2 = fadd double %v0_1, %v0_2
+ %to_2 = getelementptr double, double * %to, i64 1
+ store double %v1_1, double *%to
+ store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+ ret void
+}
+
+define void @vecload_vs_broadcast2(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @vecload_vs_broadcast2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LP:%.*]]
+; CHECK: lp:
+; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK: ext:
+; CHECK-NEXT: ret void
+;
+entry:
+br label %lp
+
+lp:
+ %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+ %from_1 = getelementptr double, double *%from, i64 1
+ %v0_1 = load double , double * %from
+ %v0_2 = load double , double * %from_1
+ %v1_1 = fadd double %p, %v0_1
+ %v1_2 = fadd double %v0_2, %v0_1
+ %to_2 = getelementptr double, double * %to, i64 1
+ store double %v1_1, double *%to
+ store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+ ret void
+}
+
+define void @vecload_vs_broadcast3(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @vecload_vs_broadcast3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LP:%.*]]
+; CHECK: lp:
+; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[P]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK: ext:
+; CHECK-NEXT: ret void
+;
+entry:
+br label %lp
+
+lp:
+ %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+ %from_1 = getelementptr double, double *%from, i64 1
+ %v0_1 = load double , double * %from
+ %v0_2 = load double , double * %from_1
+ %v1_1 = fadd double %p, %v0_1
+ %v1_2 = fadd double %v0_1, %v0_2
+ %to_2 = getelementptr double, double * %to, i64 1
+ store double %v1_1, double *%to
+ store double %v1_2, double *%to_2
+br i1 undef, label %lp, label %ext
+
+ext:
+ ret void
+}
+
+define void @shuffle_preserve_broadcast4(double * noalias %from, double * noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @shuffle_preserve_broadcast4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LP:%.*]]
+; CHECK: lp:
+; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
+; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4
+; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_2]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK: ext:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double* %from, i64 1
+  %v0_1 = load double, double* %from
+  %v0_2 = load double, double* %from_1
+  %v1_1 = fadd double %v0_2, %v0_1
+  %v1_2 = fadd double %p, %v0_1
+  %to_2 = getelementptr double, double* %to, i64 1
+  store double %v1_1, double* %to
+  store double %v1_2, double* %to_2
+  br i1 undef, label %lp, label %ext
+
+ext:
+ ret void
+}
+
+define void @vecload_vs_broadcast5(double* noalias %from, double* noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @vecload_vs_broadcast5(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LP:%.*]]
+; CHECK: lp:
+; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[FROM:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[P]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 4
+; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK: ext:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double* %from, i64 1
+  %v0_1 = load double, double* %from
+  %v0_2 = load double, double* %from_1
+  %v1_1 = fadd double %v0_1, %v0_2
+  %v1_2 = fadd double %p, %v0_1
+  %to_2 = getelementptr double, double* %to, i64 1
+  store double %v1_1, double* %to
+  store double %v1_2, double* %to_2
+  br i1 undef, label %lp, label %ext
+
+ext:
+ ret void
+}
+
+
+define void @shuffle_preserve_broadcast6(double* noalias %from, double* noalias %to, double %v1, double %v2) {
+; CHECK-LABEL: @shuffle_preserve_broadcast6(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LP:%.*]]
+; CHECK: lp:
+; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, double* [[FROM:%.*]], i32 1
+; CHECK-NEXT: [[V0_1:%.*]] = load double, double* [[FROM]], align 4
+; CHECK-NEXT: [[V0_2:%.*]] = load double, double* [[FROM_1]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0_1]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[V0_2]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[P]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 4
+; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]]
+; CHECK: ext:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %lp
+
+lp:
+  %p = phi double [ 1.000000e+00, %lp ], [ 0.000000e+00, %entry ]
+  %from_1 = getelementptr double, double* %from, i64 1
+  %v0_1 = load double, double* %from
+  %v0_2 = load double, double* %from_1
+  %v1_1 = fadd double %v0_1, %v0_2
+  %v1_2 = fadd double %v0_1, %p
+  %to_2 = getelementptr double, double* %to, i64 1
+  store double %v1_1, double* %to
+  store double %v1_2, double* %to_2
+  br i1 undef, label %lp, label %ext
+
+ext:
+ ret void
+}
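+
+; The vecload_vs_broadcast/shuffle_preserve_broadcast tests above all share one
+; scalar shape; roughly, in C (an illustrative sketch, with p the loop-carried
+; phi value), only the fadd operand order and the lane holding p vary per test:
+;
+;   double p = 0.0;
+;   do {
+;     to[0] = p + from[0];        // lane and operand order vary per test
+;     to[1] = from[1] + from[0];
+;     p = 1.0;
+;   } while (cond);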
+
+; Make sure we don't scramble operands when we reorder them, destroying
+; 'good' source order.
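+;
+; Roughly, in C (an illustrative sketch of the loop below, not the original
+; source):
+;
+;   for (int i = 0; i < 31995; i += 5) {
+;     a[i]     = a[i + 1] * a[i];
+;     a[i + 1] = a[i + 2] * a[i + 1];
+;     a[i + 2] = a[i + 3] * a[i + 2];
+;     a[i + 3] = a[i + 4] * a[i + 3];
+;     a[i + 4] = a[i + 5] * a[i + 4];
+;   }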
+
+@a = common global [32000 x float] zeroinitializer, align 16
+
+define void @good_load_order() {
+; CHECK-LABEL: @good_load_order(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
+; CHECK: for.cond1.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i32 0, i32 0), align 16
+; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
+; CHECK: for.body3:
+; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4
+; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]]
+; CHECK-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
+; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]]
+; CHECK-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995
+; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader:
+ %0 = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i64 0, i64 0), align 16
+ br label %for.body3
+
+for.body3:
+ %1 = phi float [ %0, %for.cond1.preheader ], [ %10, %for.body3 ]
+ %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %2 = add nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %2
+ %3 = load float, float* %arrayidx, align 4
+ %arrayidx5 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv
+ %mul6 = fmul float %3, %1
+ store float %mul6, float* %arrayidx5, align 4
+ %4 = add nsw i64 %indvars.iv, 2
+ %arrayidx11 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %4
+ %5 = load float, float* %arrayidx11, align 4
+ %mul15 = fmul float %5, %3
+ store float %mul15, float* %arrayidx, align 4
+ %6 = add nsw i64 %indvars.iv, 3
+ %arrayidx21 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %6
+ %7 = load float, float* %arrayidx21, align 4
+ %mul25 = fmul float %7, %5
+ store float %mul25, float* %arrayidx11, align 4
+ %8 = add nsw i64 %indvars.iv, 4
+ %arrayidx31 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %8
+ %9 = load float, float* %arrayidx31, align 4
+ %mul35 = fmul float %9, %7
+ store float %mul35, float* %arrayidx21, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
+ %arrayidx41 = getelementptr inbounds [32000 x float], [32000 x float]* @a, i64 0, i64 %indvars.iv.next
+ %10 = load float, float* %arrayidx41, align 4
+ %mul45 = fmul float %10, %9
+ store float %mul45, float* %arrayidx31, align 4
+ %11 = trunc i64 %indvars.iv.next to i32
+ %cmp2 = icmp slt i32 %11, 31995
+ br i1 %cmp2, label %for.body3, label %for.end
+
+for.end:
+ ret void
+}
+
+; Check vectorization of the following code for the double data type:
+; c[0] = a[0]+b[0];
+; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
+
+define void @load_reorder_double(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b) {
+; CHECK-LABEL: @load_reorder_double(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+ %1 = load double, double* %a
+ %2 = load double, double* %b
+ %3 = fadd double %1, %2
+ store double %3, double* %c
+ %4 = getelementptr inbounds double, double* %b, i64 1
+ %5 = load double, double* %4
+ %6 = getelementptr inbounds double, double* %a, i64 1
+ %7 = load double, double* %6
+ %8 = fadd double %5, %7
+ %9 = getelementptr inbounds double, double* %c, i64 1
+ store double %8, double* %9
+ ret void
+}
+
+; Check vectorization of the following code for the float data type:
+; c[0] = a[0]+b[0];
+; c[1] = b[1]+a[1]; // swapped b[1] and a[1]
+; c[2] = a[2]+b[2];
+; c[3] = a[3]+b[3];
+
+define void @load_reorder_float(float* nocapture %c, float* noalias nocapture readonly %a, float* noalias nocapture readonly %b) {
+; CHECK-LABEL: @load_reorder_float(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+ %1 = load float, float* %a
+ %2 = load float, float* %b
+ %3 = fadd float %1, %2
+ store float %3, float* %c
+ %4 = getelementptr inbounds float, float* %b, i64 1
+ %5 = load float, float* %4
+ %6 = getelementptr inbounds float, float* %a, i64 1
+ %7 = load float, float* %6
+ %8 = fadd float %5, %7
+ %9 = getelementptr inbounds float, float* %c, i64 1
+ store float %8, float* %9
+ %10 = getelementptr inbounds float, float* %a, i64 2
+ %11 = load float, float* %10
+ %12 = getelementptr inbounds float, float* %b, i64 2
+ %13 = load float, float* %12
+ %14 = fadd float %11, %13
+ %15 = getelementptr inbounds float, float* %c, i64 2
+ store float %14, float* %15
+ %16 = getelementptr inbounds float, float* %a, i64 3
+ %17 = load float, float* %16
+ %18 = getelementptr inbounds float, float* %b, i64 3
+ %19 = load float, float* %18
+ %20 = fadd float %17, %19
+ %21 = getelementptr inbounds float, float* %c, i64 3
+ store float %20, float* %21
+ ret void
+}
+
+; Check that we properly reorder the code below so that it gets vectorized optimally:
+; a[0] = (b[0]+c[0])+d[0];
+; a[1] = d[1]+(b[1]+c[1]); // (b[1]+c[1]) and d[1] are swapped
+; a[2] = (b[2]+c[2])+d[2];
+; a[3] = (b[3]+c[3])+d[3];
+
+define void @opcode_reorder(float* noalias nocapture %a, float* noalias nocapture readonly %b, float* noalias nocapture readonly %c,float* noalias nocapture readonly %d) {
+; CHECK-LABEL: @opcode_reorder(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[D:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP9]], align 4
+; CHECK-NEXT: ret void
+;
+ %1 = load float, float* %b
+ %2 = load float, float* %c
+ %3 = fadd float %1, %2
+ %4 = load float, float* %d
+ %5 = fadd float %3, %4
+ store float %5, float* %a
+ %6 = getelementptr inbounds float, float* %d, i64 1
+ %7 = load float, float* %6
+ %8 = getelementptr inbounds float, float* %b, i64 1
+ %9 = load float, float* %8
+ %10 = getelementptr inbounds float, float* %c, i64 1
+ %11 = load float, float* %10
+ %12 = fadd float %9, %11
+ %13 = fadd float %7, %12
+ %14 = getelementptr inbounds float, float* %a, i64 1
+ store float %13, float* %14
+ %15 = getelementptr inbounds float, float* %b, i64 2
+ %16 = load float, float* %15
+ %17 = getelementptr inbounds float, float* %c, i64 2
+ %18 = load float, float* %17
+ %19 = fadd float %16, %18
+ %20 = getelementptr inbounds float, float* %d, i64 2
+ %21 = load float, float* %20
+ %22 = fadd float %19, %21
+ %23 = getelementptr inbounds float, float* %a, i64 2
+ store float %22, float* %23
+ %24 = getelementptr inbounds float, float* %b, i64 3
+ %25 = load float, float* %24
+ %26 = getelementptr inbounds float, float* %c, i64 3
+ %27 = load float, float* %26
+ %28 = fadd float %25, %27
+ %29 = getelementptr inbounds float, float* %d, i64 3
+ %30 = load float, float* %29
+ %31 = fadd float %28, %30
+ %32 = getelementptr inbounds float, float* %a, i64 3
+ store float %31, float* %32
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opt.ll b/llvm/test/Transforms/SLPVectorizer/X86/opt.ll
new file mode 100644
index 00000000000..97aa601b071
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/opt.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=SLP
+; RUN: opt < %s -O3 -disable-slp-vectorization -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s --check-prefix=NOSLP
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure we can disable SLP vectorization in opt.
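+; The two RUN lines above differ only in -disable-slp-vectorization; the SLP
+; and NOSLP bodies below show the vectorized and scalar forms respectively.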
+
+define void @test1(double* %a, double* %b, double* %c) {
+; SLP-LABEL: @test1(
+; SLP-NEXT: entry:
+; SLP-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; SLP-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; SLP-NEXT: [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; SLP-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; SLP-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; SLP-NEXT: [[TMP5:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; SLP-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; SLP-NEXT: ret void
+;
+; NOSLP-LABEL: @test1(
+; NOSLP-NEXT: entry:
+; NOSLP-NEXT: [[I0:%.*]] = load double, double* [[A:%.*]], align 8
+; NOSLP-NEXT: [[I1:%.*]] = load double, double* [[B:%.*]], align 8
+; NOSLP-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; NOSLP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; NOSLP-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; NOSLP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; NOSLP-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; NOSLP-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; NOSLP-NEXT: store double [[MUL]], double* [[C:%.*]], align 8
+; NOSLP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; NOSLP-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; NOSLP-NEXT: ret void
+;
+entry:
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/ordering.ll
new file mode 100644
index 00000000000..bc6ef182c98
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ordering.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @updateModelQPFrame(i32 %m_Bits) {
+; CHECK-LABEL: @updateModelQPFrame(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load double, double* undef, align 8
+ %mul = fmul double undef, %0
+ %mul2 = fmul double undef, %mul
+ %mul4 = fmul double %0, %mul2
+ %mul5 = fmul double undef, 4.000000e+00
+ %mul7 = fmul double undef, %mul5
+ %conv = sitofp i32 %m_Bits to double
+ %mul8 = fmul double %conv, %mul7
+ %add = fadd double %mul4, %mul8
+ %cmp11 = fcmp olt double %add, 0.000000e+00
+ ret void
+}
+
+declare i8* @objc_msgSend(i8*, i8*, ...)
+declare i32 @personality_v0(...)
+
+define void @invoketest() personality i8* bitcast (i32 (...)* @personality_v0 to i8*) {
+; CHECK-LABEL: @invoketest(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+; CHECK: cond.true:
+; CHECK-NEXT: [[CALL49:%.*]] = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+; CHECK-NEXT: to label [[COND_TRUE54:%.*]] unwind label [[LPAD:%.*]]
+; CHECK: cond.false:
+; CHECK-NEXT: [[CALL51:%.*]] = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+; CHECK-NEXT: to label [[COND_FALSE57:%.*]] unwind label [[LPAD]]
+; CHECK: cond.true54:
+; CHECK-NEXT: [[CALL56:%.*]] = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+; CHECK-NEXT: to label [[COND_END60:%.*]] unwind label [[LPAD]]
+; CHECK: cond.false57:
+; CHECK-NEXT: [[CALL59:%.*]] = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+; CHECK-NEXT: to label [[COND_END60]] unwind label [[LPAD]]
+; CHECK: cond.end60:
+; CHECK-NEXT: br i1 undef, label [[IF_END98:%.*]], label [[IF_THEN63:%.*]]
+; CHECK: if.then63:
+; CHECK-NEXT: br label [[IF_END98]]
+; CHECK: lpad:
+; CHECK-NEXT: [[L:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: resume { i8*, i32 } [[L]]
+; CHECK: if.end98:
+; CHECK-NEXT: br label [[IF_END99:%.*]]
+; CHECK: if.end99:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %cond.true, label %cond.false
+
+cond.true:
+ %call49 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+ to label %cond.true54 unwind label %lpad
+
+cond.false:
+ %call51 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+ to label %cond.false57 unwind label %lpad
+
+cond.true54:
+ %call56 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+ to label %cond.end60 unwind label %lpad
+
+cond.false57:
+ %call59 = invoke double bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to double (i8*, i8*)*)(i8* undef, i8* undef)
+ to label %cond.end60 unwind label %lpad
+
+cond.end60:
+ %cond126 = phi double [ %call49, %cond.true54 ], [ %call51, %cond.false57 ]
+ %cond61 = phi double [ %call56, %cond.true54 ], [ %call59, %cond.false57 ]
+ br i1 undef, label %if.end98, label %if.then63
+
+if.then63:
+ %conv69 = fptrunc double undef to float
+ %conv70 = fpext float %conv69 to double
+ %div71 = fdiv double %cond126, %conv70
+ %conv78 = fptrunc double undef to float
+ %conv79 = fpext float %conv78 to double
+ %div80 = fdiv double %cond61, %conv79
+ br label %if.end98
+
+lpad:
+ %l = landingpad { i8*, i32 }
+ cleanup
+ resume { i8*, i32 } %l
+
+if.end98:
+ %dimensionsResult.sroa.0.0 = phi double [ %div71, %if.then63 ], [ %cond126, %cond.end60 ]
+ %dimensionsResult.sroa.6.0 = phi double [ %div80, %if.then63 ], [ %cond61, %cond.end60 ]
+ br label %if.end99
+
+if.end99:
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
new file mode 100644
index 00000000000..0ed88a3c821
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+; Function Attrs: nounwind uwtable
+define void @get_block(i32 %y_pos) local_unnamed_addr #0 {
+; CHECK-LABEL: @get_block(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LAND_LHS_TRUE:%.*]]
+; CHECK: land.lhs.true:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: unreachable
+; CHECK: if.end:
+; CHECK-NEXT: [[SUB14:%.*]] = sub nsw i32 [[Y_POS:%.*]], undef
+; CHECK-NEXT: [[SHR15:%.*]] = ashr i32 [[SUB14]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[SHR15]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[SUB14]], i32 1
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], <i32 0, i32 -1, i32 -5, i32 -9>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[SHR15]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 undef, i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef
+; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64
+; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
+; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP19]]
+; CHECK-NEXT: unreachable
+;
+entry:
+ br label %land.lhs.true
+
+land.lhs.true: ; preds = %entry
+ br i1 undef, label %if.then, label %if.end
+
+if.then: ; preds = %land.lhs.true
+ unreachable
+
+if.end: ; preds = %land.lhs.true
+ %sub14 = sub nsw i32 %y_pos, undef
+ %shr15 = ashr i32 %sub14, 2
+ %cmp.i.i = icmp sgt i32 %shr15, 0
+ %cond.i.i = select i1 %cmp.i.i, i32 %shr15, i32 0
+ %cmp.i4.i = icmp slt i32 %cond.i.i, undef
+ %cond.i5.i = select i1 %cmp.i4.i, i32 %cond.i.i, i32 undef
+ %idxprom30 = sext i32 %cond.i5.i to i64
+ %arrayidx31 = getelementptr inbounds i16*, i16** undef, i64 %idxprom30
+ %cmp.i.i.1 = icmp sgt i32 %sub14, -1
+ %cond.i.i.1 = select i1 %cmp.i.i.1, i32 undef, i32 0
+ %cmp.i4.i.1 = icmp slt i32 %cond.i.i.1, undef
+ %cond.i5.i.1 = select i1 %cmp.i4.i.1, i32 %cond.i.i.1, i32 undef
+ %idxprom30.1 = sext i32 %cond.i5.i.1 to i64
+ %arrayidx31.1 = getelementptr inbounds i16*, i16** undef, i64 %idxprom30.1
+ %cmp.i.i.2 = icmp sgt i32 %sub14, -5
+ %cond.i.i.2 = select i1 %cmp.i.i.2, i32 undef, i32 0
+ %cmp.i4.i.2 = icmp slt i32 %cond.i.i.2, undef
+ %cond.i5.i.2 = select i1 %cmp.i4.i.2, i32 %cond.i.i.2, i32 undef
+ %idxprom30.2 = sext i32 %cond.i5.i.2 to i64
+ %arrayidx31.2 = getelementptr inbounds i16*, i16** undef, i64 %idxprom30.2
+ %cmp.i.i.3 = icmp sgt i32 %sub14, -9
+ %cond.i.i.3 = select i1 %cmp.i.i.3, i32 undef, i32 0
+ %cmp.i4.i.3 = icmp slt i32 %cond.i.i.3, undef
+ %cond.i5.i.3 = select i1 %cmp.i4.i.3, i32 %cond.i.i.3, i32 undef
+ %idxprom30.3 = sext i32 %cond.i5.i.3 to i64
+ %arrayidx31.3 = getelementptr inbounds i16*, i16** undef, i64 %idxprom30.3
+ unreachable
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
new file mode 100644
index 00000000000..fe604e2652d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -0,0 +1,341 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+;int foo(double *A, int k) {
+; double A0;
+; double A1;
+; if (k) {
+; A0 = 3;
+; A1 = 5;
+; } else {
+; A0 = A[10];
+; A1 = A[11];
+; }
+; A[0] = A0;
+; A[1] = A1;
+;}
+
+
+define i32 @foo(double* nocapture %A, i32 %k) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[K:%.*]], 0
+; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]]
+; CHECK: if.else:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 10
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[IF_ELSE]] ], [ <double 3.000000e+00, double 5.000000e+00>, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %tobool = icmp eq i32 %k, 0
+ br i1 %tobool, label %if.else, label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx = getelementptr inbounds double, double* %A, i64 10
+ %0 = load double, double* %arrayidx, align 8
+ %arrayidx1 = getelementptr inbounds double, double* %A, i64 11
+ %1 = load double, double* %arrayidx1, align 8
+ br label %if.end
+
+if.end: ; preds = %entry, %if.else
+ %A0.0 = phi double [ %0, %if.else ], [ 3.000000e+00, %entry ]
+ %A1.0 = phi double [ %1, %if.else ], [ 5.000000e+00, %entry ]
+ store double %A0.0, double* %A, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %A, i64 1
+ store double %A1.0, double* %arrayidx3, align 8
+ ret i32 undef
+}
+
+
+;int foo(double * restrict B, double * restrict A, int n, int m) {
+; double R=A[1];
+; double G=A[0];
+; for (int i=0; i < 100; i++) {
+; R += 10;
+; G += 10;
+; R *= 4;
+; G *= 4;
+; R += 4;
+; G += 4;
+; }
+; B[0] = G;
+; B[1] = R;
+; return 0;
+;}
+
+define i32 @foo2(double* noalias nocapture %B, double* noalias nocapture %A, i32 %n, i32 %m) #0 {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_019:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_019]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %arrayidx = getelementptr inbounds double, double* %A, i64 1
+ %0 = load double, double* %arrayidx, align 8
+ %1 = load double, double* %A, align 8
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.019 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %G.018 = phi double [ %1, %entry ], [ %add5, %for.body ]
+ %R.017 = phi double [ %0, %entry ], [ %add4, %for.body ]
+ %add = fadd double %R.017, 1.000000e+01
+ %add2 = fadd double %G.018, 1.000000e+01
+ %mul = fmul double %add, 4.000000e+00
+ %mul3 = fmul double %add2, 4.000000e+00
+ %add4 = fadd double %mul, 4.000000e+00
+ %add5 = fadd double %mul3, 4.000000e+00
+ %inc = add nsw i32 %i.019, 1
+ %exitcond = icmp eq i32 %inc, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ store double %add5, double* %B, align 8
+ %arrayidx7 = getelementptr inbounds double, double* %B, i64 1
+ store double %add4, double* %arrayidx7, align 8
+ ret i32 0
+}
+
+; float foo3(float *A) {
+;
+; float R = A[0];
+; float G = A[1];
+; float B = A[2];
+; float Y = A[3];
+; float P = A[4];
+; for (int i=0; i < 121; i+=3) {
+; R+=A[i+0]*7;
+; G+=A[i+1]*8;
+; B+=A[i+2]*9;
+; Y+=A[i+3]*10;
+; P+=A[i+4]*11;
+; }
+;
+; return R+G+B+Y+P;
+; }
+
+define float @foo3(float* nocapture readonly %A) #0 {
+; CHECK-LABEL: @foo3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE]], i32 3
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
+; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]]
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>*
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float 8.000000e+00>
+; CHECK-NEXT: [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK: for.end:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]]
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2
+; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1
+; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]]
+; CHECK-NEXT: ret float [[ADD31]]
+;
+entry:
+ %0 = load float, float* %A, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %A, i64 1
+ %1 = load float, float* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 2
+ %2 = load float, float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds float, float* %A, i64 3
+ %3 = load float, float* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %A, i64 4
+ %4 = load float, float* %arrayidx4, align 4
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %P.056 = phi float [ %4, %entry ], [ %add26, %for.body ]
+ %Y.055 = phi float [ %3, %entry ], [ %add21, %for.body ]
+ %B.054 = phi float [ %2, %entry ], [ %add16, %for.body ]
+ %G.053 = phi float [ %1, %entry ], [ %add11, %for.body ]
+ %R.052 = phi float [ %0, %entry ], [ %add6, %for.body ]
+ %5 = phi float [ %1, %entry ], [ %11, %for.body ]
+ %6 = phi float [ %0, %entry ], [ %9, %for.body ]
+ %mul = fmul float %6, 7.000000e+00
+ %add6 = fadd float %R.052, %mul
+ %mul10 = fmul float %5, 8.000000e+00
+ %add11 = fadd float %G.053, %mul10
+ %7 = add nsw i64 %indvars.iv, 2
+ %arrayidx14 = getelementptr inbounds float, float* %A, i64 %7
+ %8 = load float, float* %arrayidx14, align 4
+ %mul15 = fmul float %8, 9.000000e+00
+ %add16 = fadd float %B.054, %mul15
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
+ %arrayidx19 = getelementptr inbounds float, float* %A, i64 %indvars.iv.next
+ %9 = load float, float* %arrayidx19, align 4
+ %mul20 = fmul float %9, 1.000000e+01
+ %add21 = fadd float %Y.055, %mul20
+ %10 = add nsw i64 %indvars.iv, 4
+ %arrayidx24 = getelementptr inbounds float, float* %A, i64 %10
+ %11 = load float, float* %arrayidx24, align 4
+ %mul25 = fmul float %11, 1.100000e+01
+ %add26 = fadd float %P.056, %mul25
+ %12 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %12, 121
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %add28 = fadd float %add6, %add11
+ %add29 = fadd float %add28, %add16
+ %add30 = fadd float %add29, %add21
+ %add31 = fadd float %add30, %add26
+ ret float %add31
+}
+
+; Make sure that the order of phi nodes of different types does not prevent
+; vectorization of the same-typed phi nodes.
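+; Roughly, in C (an illustrative sketch): four float accumulators whose phis
+; are interleaved with an i64 induction variable:
+;
+;   float y = 10, b = 10, g = 10, r = 10;
+;   for (long i = 0; i < 128; i += 4) {
+;     float yn = y * 8, bn = b * 9, gn = r * 100, rn = g * 111;
+;     y = yn; b = bn; g = gn; r = rn;
+;   }
+;   return y + b + g + r;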
+define float @sort_phi_type(float* nocapture readonly %A) {
+; CHECK-LABEL: @sort_phi_type(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ <float 1.000000e+01, float 1.000000e+01, float 1.000000e+01, float 1.000000e+01>, [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP5]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP9]] = fmul <4 x float> [[TMP8]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK: for.end:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP9]], i32 1
+; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i32 2
+; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP9]], i32 3
+; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP13]]
+; CHECK-NEXT: ret float [[ADD31]]
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %Y = phi float [ 1.000000e+01, %entry ], [ %mul10, %for.body ]
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %B = phi float [ 1.000000e+01, %entry ], [ %mul15, %for.body ]
+ %G = phi float [ 1.000000e+01, %entry ], [ %mul20, %for.body ]
+ %R = phi float [ 1.000000e+01, %entry ], [ %mul25, %for.body ]
+ %mul10 = fmul float %Y, 8.000000e+00
+ %mul15 = fmul float %B, 9.000000e+00
+ %mul20 = fmul float %R, 10.000000e+01
+ %mul25 = fmul float %G, 11.100000e+01
+ %indvars.iv.next = add nsw i64 %indvars.iv, 4
+ %cmp = icmp slt i64 %indvars.iv.next, 128
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %add28 = fadd float 1.000000e+01, %mul10
+ %add29 = fadd float %mul10, %mul15
+ %add30 = fadd float %add29, %mul20
+ %add31 = fadd float %add30, %mul25
+ ret float %add31
+}
+
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, x86_fp80* [[I1:%.*]], align 16
+; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, x86_fp80* [[I1]], i64 1
+; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, x86_fp80* [[I1_GEP1]], align 16
+; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[I2_GEP0:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[I2:%.*]], i64 0
+; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, x86_fp80* [[I2_GEP0]], align 16
+; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[I2]], i64 1
+; CHECK-NEXT: [[I2_1:%.*]] = load x86_fp80, x86_fp80* [[I2_GEP1]], align 16
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI0:%.*]] = phi x86_fp80 [ [[I1_0]], [[ENTRY:%.*]] ], [ [[I2_0]], [[THEN]] ]
+; CHECK-NEXT: [[PHI1:%.*]] = phi x86_fp80 [ [[I1_1]], [[ENTRY]] ], [ [[I2_1]], [[THEN]] ]
+; CHECK-NEXT: store x86_fp80 [[PHI0]], x86_fp80* [[O:%.*]], align 16
+; CHECK-NEXT: [[O_GEP1:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[O]], i64 1
+; CHECK-NEXT: store x86_fp80 [[PHI1]], x86_fp80* [[O_GEP1]], align 16
+; CHECK-NEXT: ret void
+;
+; Test that we correctly recognize discontiguous memory in arrays where the
+; element size is less than the alignment, and through various GEP formations.
+; We disable vectorization of x86_fp80 for now.
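+; Concretely: with f80:128:128 in the datalayout, each x86_fp80 array element
+; is allocated 16 bytes while the type stores only 10, so consecutive GEPs are
+; 16 bytes apart and the elements are not packed as a vector load would expect.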
+
+entry:
+ %i1.0 = load x86_fp80, x86_fp80* %i1, align 16
+ %i1.gep1 = getelementptr x86_fp80, x86_fp80* %i1, i64 1
+ %i1.1 = load x86_fp80, x86_fp80* %i1.gep1, align 16
+ br i1 undef, label %then, label %end
+
+then:
+ %i2.gep0 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 0
+ %i2.0 = load x86_fp80, x86_fp80* %i2.gep0, align 16
+ %i2.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 1
+ %i2.1 = load x86_fp80, x86_fp80* %i2.gep1, align 16
+ br label %end
+
+end:
+ %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+ %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+ store x86_fp80 %phi0, x86_fp80* %o, align 16
+ %o.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %o, i64 1
+ store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll
new file mode 100644
index 00000000000..b08d9abf9ff
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi3.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.GPar.0.16.26 = type { [0 x double], double }
+
+@d = external global double, align 8
+
+declare %struct.GPar.0.16.26* @Rf_gpptr(...)
+
+define void @Rf_GReset() {
+; CHECK-LABEL: @Rf_GReset(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* @d, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> undef, double [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP1]]
+; CHECK-NEXT: br i1 icmp eq (%struct.GPar.0.16.26* (...)* inttoptr (i64 115 to %struct.GPar.0.16.26* (...)*), %struct.GPar.0.16.26* (...)* @Rf_gpptr), label [[IF_THEN:%.*]], label [[IF_END7:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
+; CHECK-NEXT: [[TMP4:%.*]] = fdiv <2 x double> [[TMP3]], undef
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt double [[TMP5]], [[TMP6]]
+; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN6:%.*]], label [[IF_END7]]
+; CHECK: if.then6:
+; CHECK-NEXT: br label [[IF_END7]]
+; CHECK: if.end7:
+; CHECK-NEXT: ret void
+;
+entry:
+ %sub = fsub double -0.000000e+00, undef
+ %0 = load double, double* @d, align 8
+ %sub1 = fsub double -0.000000e+00, %0
+ br i1 icmp eq (%struct.GPar.0.16.26* (...)* inttoptr (i64 115 to %struct.GPar.0.16.26* (...)*), %struct.GPar.0.16.26* (...)* @Rf_gpptr), label %if.then, label %if.end7
+
+if.then: ; preds = %entry
+ %sub2 = fsub double %sub, undef
+ %div.i = fdiv double %sub2, undef
+ %sub4 = fsub double %sub1, undef
+ %div.i16 = fdiv double %sub4, undef
+ %cmp = fcmp ogt double %div.i, %div.i16
+ br i1 %cmp, label %if.then6, label %if.end7
+
+if.then6: ; preds = %if.then
+ br label %if.end7
+
+if.end7: ; preds = %if.then6, %if.then, %entry
+ %g.0 = phi double [ 0.000000e+00, %if.then6 ], [ %sub, %if.then ], [ %sub, %entry ]
+ ret void
+}
+
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
new file mode 100644
index 00000000000..0a752889065
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_landingpad.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -mtriple=x86_64-apple-macosx10.9.0 -S -o - | FileCheck %s
+
+target datalayout = "f64:64:64-v64:64:64"
+
+define void @test_phi_in_landingpad() personality i8*
+    bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK-LABEL: @test_phi_in_landingpad(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: invoke void @foo()
+; CHECK-NEXT: to label [[INNER:%.*]] unwind label [[LPAD:%.*]]
+; CHECK: inner:
+; CHECK-NEXT: invoke void @foo()
+; CHECK-NEXT: to label [[DONE:%.*]] unwind label [[LPAD]]
+; CHECK: lpad:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ undef, [[ENTRY:%.*]] ], [ undef, [[INNER]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT: catch i8* null
+; CHECK-NEXT: br label [[DONE]]
+; CHECK: done:
+; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ undef, [[INNER]] ], [ [[TMP0]], [[LPAD]] ]
+; CHECK-NEXT: ret void
+;
+entry:
+ invoke void @foo()
+ to label %inner unwind label %lpad
+
+inner:
+ %x0 = fsub double undef, undef
+ %y0 = fsub double undef, undef
+ invoke void @foo()
+ to label %done unwind label %lpad
+
+lpad:
+ %x1 = phi double [ undef, %entry ], [ undef, %inner ]
+ %y1 = phi double [ undef, %entry ], [ undef, %inner ]
+ landingpad { i8*, i32 } catch i8* null
+ br label %done
+
+done:
+ phi double [ %x0, %inner ], [ %x1, %lpad ]
+ phi double [ %y0, %inner ], [ %y1, %lpad ]
+ ret void
+}
+
+declare void @foo()
+
+declare i32 @__gxx_personality_v0(...)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
new file mode 100644
index 00000000000..f708341877a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+; We purposely over-align f64 to 128 bits here.
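+; With f64:128:128, a double stores 8 bytes but is allocated 16 in arrays, so
+; &i1[1] is 16 bytes past &i1[0] and a single <2 x double> access covering
+; both elements would be wrong.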
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:128:128-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+
+define void @test(double* %i1, double* %i2, double* %o) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I1_0:%.*]] = load double, double* [[I1:%.*]], align 16
+; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, double* [[I1]], i64 1
+; CHECK-NEXT: [[I1_1:%.*]] = load double, double* [[I1_GEP1]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[I1_0]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1_1]], i32 1
+; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[I2_GEP0:%.*]] = getelementptr inbounds double, double* [[I2:%.*]], i64 0
+; CHECK-NEXT: [[I2_0:%.*]] = load double, double* [[I2_GEP0]], align 16
+; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds double, double* [[I2]], i64 1
+; CHECK-NEXT: [[I2_1:%.*]] = load double, double* [[I2_GEP1]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[I2_0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I2_1]], i32 1
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[TMP3]], [[THEN]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: store double [[TMP5]], double* [[O:%.*]], align 16
+; CHECK-NEXT: [[O_GEP1:%.*]] = getelementptr inbounds double, double* [[O]], i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: store double [[TMP6]], double* [[O_GEP1]], align 16
+; CHECK-NEXT: ret void
+;
+; Test that we correctly recognize discontiguous memory in arrays where the
+; element size is less than the alignment, and through various GEP formations.
+
+entry:
+ %i1.0 = load double, double* %i1, align 16
+ %i1.gep1 = getelementptr double, double* %i1, i64 1
+ %i1.1 = load double, double* %i1.gep1, align 16
+ br i1 undef, label %then, label %end
+
+then:
+ %i2.gep0 = getelementptr inbounds double, double* %i2, i64 0
+ %i2.0 = load double, double* %i2.gep0, align 16
+ %i2.gep1 = getelementptr inbounds double, double* %i2, i64 1
+ %i2.1 = load double, double* %i2.gep1, align 16
+ br label %end
+
+end:
+ %phi0 = phi double [ %i1.0, %entry ], [ %i2.0, %then ]
+ %phi1 = phi double [ %i1.1, %entry ], [ %i2.1, %then ]
+ store double %phi0, double* %o, align 16
+ %o.gep1 = getelementptr inbounds double, double* %o, i64 1
+ store double %phi1, double* %o.gep1, align 16
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll b/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
new file mode 100644
index 00000000000..d7f80cf9ff5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
+
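+; Both tests below add two loaded values and signed-divide by a power of two;
+; roughly, in C (an illustrative sketch):
+;
+;   a[i] = (b[i] + c[i]) / 2;        // powof2div_uniform: divisor 2 in every lane
+;   a[i] = (b[i] + c[i]) / (2 << i); // powof2div_nonuniform: 2, 4, 8, 16
+;
+; Per the checks below, the nonuniform divisors vectorize only with AVX2.
+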
+define void @powof2div_uniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) {
+; CHECK-LABEL: @powof2div_uniform(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* %b, align 4
+ %1 = load i32, i32* %c, align 4
+ %add = add nsw i32 %1, %0
+ %div = sdiv i32 %add, 2
+ store i32 %div, i32* %a, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 1
+ %2 = load i32, i32* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 1
+ %3 = load i32, i32* %arrayidx4, align 4
+ %add5 = add nsw i32 %3, %2
+ %div6 = sdiv i32 %add5, 2
+ %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1
+ store i32 %div6, i32* %arrayidx7, align 4
+ %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 2
+ %4 = load i32, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 2
+ %5 = load i32, i32* %arrayidx9, align 4
+ %add10 = add nsw i32 %5, %4
+ %div11 = sdiv i32 %add10, 2
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2
+ store i32 %div11, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %b, i64 3
+ %6 = load i32, i32* %arrayidx13, align 4
+ %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 3
+ %7 = load i32, i32* %arrayidx14, align 4
+ %add15 = add nsw i32 %7, %6
+ %div16 = sdiv i32 %add15, 2
+ %arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3
+ store i32 %div16, i32* %arrayidx17, align 4
+ ret void
+}
+
+define void @powof2div_nonuniform(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) {
+; AVX1-LABEL: @powof2div_nonuniform(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[B:%.*]], align 4
+; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4
+; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; AVX1-NEXT: [[DIV:%.*]] = sdiv i32 [[ADD]], 2
+; AVX1-NEXT: store i32 [[DIV]], i32* [[A:%.*]], align 4
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 1
+; AVX1-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
+; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 1
+; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
+; AVX1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP3]], [[TMP2]]
+; AVX1-NEXT: [[DIV6:%.*]] = sdiv i32 [[ADD5]], 4
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; AVX1-NEXT: store i32 [[DIV6]], i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; AVX1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
+; AVX1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; AVX1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; AVX1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP5]], [[TMP4]]
+; AVX1-NEXT: [[DIV11:%.*]] = sdiv i32 [[ADD10]], 8
+; AVX1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX1-NEXT: store i32 [[DIV11]], i32* [[ARRAYIDX12]], align 4
+; AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; AVX1-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; AVX1-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; AVX1-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX14]], align 4
+; AVX1-NEXT: [[ADD15:%.*]] = add nsw i32 [[TMP7]], [[TMP6]]
+; AVX1-NEXT: [[DIV16:%.*]] = sdiv i32 [[ADD15]], 16
+; AVX1-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; AVX1-NEXT: store i32 [[DIV16]], i32* [[ARRAYIDX17]], align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @powof2div_nonuniform(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; AVX2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; AVX2-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; AVX2-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; AVX2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; AVX2-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; AVX2-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; AVX2-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; AVX2-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], <i32 2, i32 4, i32 8, i32 16>
+; AVX2-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; AVX2-NEXT: ret void
+;
+entry:
+ %0 = load i32, i32* %b, align 4
+ %1 = load i32, i32* %c, align 4
+ %add = add nsw i32 %1, %0
+ %div = sdiv i32 %add, 2
+ store i32 %div, i32* %a, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %b, i64 1
+ %2 = load i32, i32* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32* %c, i64 1
+ %3 = load i32, i32* %arrayidx4, align 4
+ %add5 = add nsw i32 %3, %2
+ %div6 = sdiv i32 %add5, 4
+ %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 1
+ store i32 %div6, i32* %arrayidx7, align 4
+ %arrayidx8 = getelementptr inbounds i32, i32* %b, i64 2
+ %4 = load i32, i32* %arrayidx8, align 4
+ %arrayidx9 = getelementptr inbounds i32, i32* %c, i64 2
+ %5 = load i32, i32* %arrayidx9, align 4
+ %add10 = add nsw i32 %5, %4
+ %div11 = sdiv i32 %add10, 8
+ %arrayidx12 = getelementptr inbounds i32, i32* %a, i64 2
+ store i32 %div11, i32* %arrayidx12, align 4
+ %arrayidx13 = getelementptr inbounds i32, i32* %b, i64 3
+ %6 = load i32, i32* %arrayidx13, align 4
+ %arrayidx14 = getelementptr inbounds i32, i32* %c, i64 3
+ %7 = load i32, i32* %arrayidx14, align 4
+ %add15 = add nsw i32 %7, %6
+ %div16 = sdiv i32 %add15, 16
+ %arrayidx17 = getelementptr inbounds i32, i32* %a, i64 3
+ store i32 %div16, i32* %arrayidx17, align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr16571.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr16571.ll
new file mode 100644
index 00000000000..13d82149c0c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr16571.ll
@@ -0,0 +1,22 @@
+; RUN: opt < %s -slp-vectorizer -S -mtriple=i686-pc-win32 -mcpu=corei7-avx
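+; Note: there is deliberately no FileCheck pipe here; presumably this is a
+; compile-only regression test for PR16571, checking that the SLP vectorizer
+; does not crash on the IR below rather than checking its output.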
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
+target triple = "i686-pc-win32"
+
+define hidden fastcc void @"System.PrimitiveTypesParser.TryParseIEEE754<char>(char*,uint,double&)"() unnamed_addr {
+"@0":
+ br i1 undef, label %"@38.lr.ph", label %"@37"
+
+"@37": ; preds = %"@38.lr.ph", %"@44", %"@0"
+ ret void
+
+"@44": ; preds = %"@38.lr.ph"
+ %0 = add i64 undef, undef
+ %1 = add i32 %mainPartDigits.loc.0.ph45, 1
+ br i1 undef, label %"@38.lr.ph", label %"@37"
+
+"@38.lr.ph": ; preds = %"@44", %"@0"
+ %mainDoublePart.loc.0.ph46 = phi i64 [ %0, %"@44" ], [ 0, %"@0" ]
+ %mainPartDigits.loc.0.ph45 = phi i32 [ %1, %"@44" ], [ 0, %"@0" ]
+ br i1 undef, label %"@44", label %"@37"
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr16628.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr16628.ll
new file mode 100644
index 00000000000..9f195649c63
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr16628.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+@c = common global i32 0, align 4
+@a = common global i16 0, align 2
+@b = common global i16 0, align 2
+
+; Function Attrs: nounwind ssp uwtable
+define void @f() {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @g()
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @c, align 4
+; CHECK-NEXT: [[LNOT:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT: [[LNOT_EXT:%.*]] = zext i1 [[LNOT]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* @a, align 2
+; CHECK-NEXT: [[LNOT2:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT: [[LNOT_EXT3:%.*]] = zext i1 [[LNOT2]] to i32
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[LNOT_EXT3]], [[LNOT_EXT]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], [[OR]]
+; CHECK-NEXT: [[CONV4:%.*]] = zext i1 [[CMP]] to i16
+; CHECK-NEXT: store i16 [[CONV4]], i16* @b, align 2
+; CHECK-NEXT: ret void
+;
+entry:
+ %call = tail call i32 (...) @g()
+ %0 = load i32, i32* @c, align 4
+ %lnot = icmp eq i32 %0, 0
+ %lnot.ext = zext i1 %lnot to i32
+ %1 = load i16, i16* @a, align 2
+ %lnot2 = icmp eq i16 %1, 0
+ %lnot.ext3 = zext i1 %lnot2 to i32
+ %or = or i32 %lnot.ext3, %lnot.ext
+ %cmp = icmp eq i32 %call, %or
+ %conv4 = zext i1 %cmp to i16
+ store i16 %conv4, i16* @b, align 2
+ ret void
+}
+
+declare i32 @g(...)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll
new file mode 100644
index 00000000000..5b91c3072b1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr16899.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=i386--netbsd -mcpu=i486 | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+@a = common global i32* null, align 4
+
+; Function Attrs: noreturn nounwind readonly
+define i32 @fn1() #0 {
+; CHECK-LABEL: @fn1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** @a, align 4, !tbaa !0
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4, !tbaa !4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4, !tbaa !4
+; CHECK-NEXT: br label [[DO_BODY:%.*]]
+; CHECK: do.body:
+; CHECK-NEXT: [[C_0:%.*]] = phi i32 [ [[TMP2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT: [[B_0:%.*]] = phi i32 [ [[TMP1]], [[ENTRY]] ], [ [[ADD:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[B_0]], [[C_0]]
+; CHECK-NEXT: [[ADD2]] = add nsw i32 [[ADD]], 1
+; CHECK-NEXT: br label [[DO_BODY]]
+;
+entry:
+ %0 = load i32*, i32** @a, align 4, !tbaa !4
+ %1 = load i32, i32* %0, align 4, !tbaa !5
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i32 1
+ %2 = load i32, i32* %arrayidx1, align 4, !tbaa !5
+ br label %do.body
+
+do.body: ; preds = %do.body, %entry
+ %c.0 = phi i32 [ %2, %entry ], [ %add2, %do.body ]
+ %b.0 = phi i32 [ %1, %entry ], [ %add, %do.body ]
+ %add = add nsw i32 %b.0, %c.0
+ %add2 = add nsw i32 %add, 1
+ br label %do.body
+}
+
+attributes #0 = { noreturn nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!"any pointer", !1}
+!1 = !{!"omnipotent char", !2}
+!2 = !{!"Simple C/C++ TBAA"}
+!3 = !{!"int", !1}
+!4 = !{!0, !0, i64 0}
+!5 = !{!3, !3, i64 0}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr18060.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr18060.ll
new file mode 100644
index 00000000000..0af5d0fa053
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr18060.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=i386-pc-linux | FileCheck %s
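+; The checks below pin fully scalar output: nothing in this PR18060
+; reproducer is expected to be vectorized, and (presumably) the test guards
+; against a historical crash or miscompile on this shape of CFG.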
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-pc-linux"
+
+; Function Attrs: nounwind
+define i32 @_Z16adjustFixupValueyj(i64 %Value, i32 %Kind) {
+; CHECK-LABEL: @_Z16adjustFixupValueyj(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[EXTRACT_T:%.*]] = trunc i64 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[EXTRACT:%.*]] = lshr i64 [[VALUE]], 12
+; CHECK-NEXT: [[EXTRACT_T6:%.*]] = trunc i64 [[EXTRACT]] to i32
+; CHECK-NEXT: switch i32 [[KIND:%.*]], label [[SW_DEFAULT:%.*]] [
+; CHECK-NEXT: i32 0, label [[RETURN:%.*]]
+; CHECK-NEXT: i32 1, label [[RETURN]]
+; CHECK-NEXT: i32 129, label [[SW_BB1:%.*]]
+; CHECK-NEXT: i32 130, label [[SW_BB2:%.*]]
+; CHECK-NEXT: ]
+; CHECK: sw.default:
+; CHECK-NEXT: call void @_Z25llvm_unreachable_internalv()
+; CHECK-NEXT: unreachable
+; CHECK: sw.bb1:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i64 [[VALUE]], 16
+; CHECK-NEXT: [[EXTRACT_T5:%.*]] = trunc i64 [[SHR]] to i32
+; CHECK-NEXT: [[EXTRACT7:%.*]] = lshr i64 [[VALUE]], 28
+; CHECK-NEXT: [[EXTRACT_T8:%.*]] = trunc i64 [[EXTRACT7]] to i32
+; CHECK-NEXT: br label [[SW_BB2]]
+; CHECK: sw.bb2:
+; CHECK-NEXT: [[VALUE_ADDR_0_OFF0:%.*]] = phi i32 [ [[EXTRACT_T]], [[ENTRY:%.*]] ], [ [[EXTRACT_T5]], [[SW_BB1]] ]
+; CHECK-NEXT: [[VALUE_ADDR_0_OFF12:%.*]] = phi i32 [ [[EXTRACT_T6]], [[ENTRY]] ], [ [[EXTRACT_T8]], [[SW_BB1]] ]
+; CHECK-NEXT: [[CONV6:%.*]] = and i32 [[VALUE_ADDR_0_OFF0]], 4095
+; CHECK-NEXT: [[CONV4:%.*]] = shl i32 [[VALUE_ADDR_0_OFF12]], 16
+; CHECK-NEXT: [[SHL:%.*]] = and i32 [[CONV4]], 983040
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHL]], [[CONV6]]
+; CHECK-NEXT: [[OR11:%.*]] = or i32 [[OR]], 8388608
+; CHECK-NEXT: br label [[RETURN]]
+; CHECK: return:
+; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[OR11]], [[SW_BB2]] ], [ [[EXTRACT_T]], [[ENTRY]] ], [ [[EXTRACT_T]], [[ENTRY]] ]
+; CHECK-NEXT: ret i32 [[RETVAL_0]]
+;
+entry:
+ %extract.t = trunc i64 %Value to i32
+ %extract = lshr i64 %Value, 12
+ %extract.t6 = trunc i64 %extract to i32
+ switch i32 %Kind, label %sw.default [
+ i32 0, label %return
+ i32 1, label %return
+ i32 129, label %sw.bb1
+ i32 130, label %sw.bb2
+ ]
+
+sw.default: ; preds = %entry
+ call void @_Z25llvm_unreachable_internalv()
+ unreachable
+
+sw.bb1: ; preds = %entry
+ %shr = lshr i64 %Value, 16
+ %extract.t5 = trunc i64 %shr to i32
+ %extract7 = lshr i64 %Value, 28
+ %extract.t8 = trunc i64 %extract7 to i32
+ br label %sw.bb2
+
+sw.bb2: ; preds = %sw.bb1, %entry
+ %Value.addr.0.off0 = phi i32 [ %extract.t, %entry ], [ %extract.t5, %sw.bb1 ]
+ %Value.addr.0.off12 = phi i32 [ %extract.t6, %entry ], [ %extract.t8, %sw.bb1 ]
+ %conv6 = and i32 %Value.addr.0.off0, 4095
+ %conv4 = shl i32 %Value.addr.0.off12, 16
+ %shl = and i32 %conv4, 983040
+ %or = or i32 %shl, %conv6
+ %or11 = or i32 %or, 8388608
+ br label %return
+
+return: ; preds = %sw.bb2, %entry, %entry
+ %retval.0 = phi i32 [ %or11, %sw.bb2 ], [ %extract.t, %entry ], [ %extract.t, %entry ]
+ ret i32 %retval.0
+}
+
+; Function Attrs: noreturn
+declare void @_Z25llvm_unreachable_internalv()
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr19657.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr19657.ll
new file mode 100644
index 00000000000..2bde319c175
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr19657.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-max-reg-size=128 -S -mcpu=corei7-avx | FileCheck %s --check-prefix=V128
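+; With the default register size the four scalar lanes collapse into a single
+; <4 x double> (CHECK); capping the vector register width at 128 bits via
+; -slp-max-reg-size=128 instead yields two independent <2 x double> halves
+; (V128), as the check lines below show.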
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(double* %x) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[X]], i64 2
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[X]], i64 3
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[X]] to <4 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x double> [[TMP5]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[X]] to <4 x double>*
+; CHECK-NEXT: store <4 x double> [[TMP7]], <4 x double>* [[TMP8]], align 8
+; CHECK-NEXT: ret void
+;
+; V128-LABEL: @foo(
+; V128-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 1
+; V128-NEXT: [[TMP2:%.*]] = bitcast double* [[X]] to <2 x double>*
+; V128-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; V128-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP3]]
+; V128-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP3]]
+; V128-NEXT: [[TMP6:%.*]] = bitcast double* [[X]] to <2 x double>*
+; V128-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; V128-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[X]], i64 2
+; V128-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[X]], i64 3
+; V128-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
+; V128-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8
+; V128-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], [[TMP10]]
+; V128-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], [[TMP10]]
+; V128-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
+; V128-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8
+; V128-NEXT: ret void
+;
+ %1 = load double, double* %x, align 8
+ %2 = fadd double %1, %1
+ %3 = fadd double %2, %1
+ store double %3, double* %x, align 8
+ %4 = getelementptr inbounds double, double* %x, i64 1
+ %5 = load double, double* %4, align 8
+ %6 = fadd double %5, %5
+ %7 = fadd double %6, %5
+ store double %7, double* %4, align 8
+ %8 = getelementptr inbounds double, double* %x, i64 2
+ %9 = load double, double* %8, align 8
+ %10 = fadd double %9, %9
+ %11 = fadd double %10, %9
+ store double %11, double* %8, align 8
+ %12 = getelementptr inbounds double, double* %x, i64 3
+ %13 = load double, double* %12, align 8
+ %14 = fadd double %13, %13
+ %15 = fadd double %14, %13
+ store double %15, double* %12, align 8
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr23510.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr23510.ll
new file mode 100644
index 00000000000..420fdde3631
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr23510.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; PR23510
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
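+; Only the paired lshr-by-4 operations on a[0]/a[1] are vectorizable here; the
+; loads of a[i] and the running sum through @total stay scalar, so the expected
+; output below mixes <2 x i64> shifts with scalar adds.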
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@total = global i64 0, align 8
+
+define void @_Z3fooPml(i64* nocapture %a, i64 %i) {
+; CHECK-LABEL: @_Z3fooPml(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[A]] to <2 x i64>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 4, i64 4>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* @total, align 8
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[TMP3]], [[TMP2]]
+; CHECK-NEXT: store i64 [[ADD]], i64* @total, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[A]] to <2 x i64>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP5]], <i64 4, i64 4>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[A]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* @total, align 8
+; CHECK-NEXT: [[ADD9:%.*]] = add i64 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: store i64 [[ADD9]], i64* @total, align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %tmp = load i64, i64* %a, align 8
+ %shr = lshr i64 %tmp, 4
+ store i64 %shr, i64* %a, align 8
+ %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1
+ %tmp1 = load i64, i64* %arrayidx1, align 8
+ %shr2 = lshr i64 %tmp1, 4
+ store i64 %shr2, i64* %arrayidx1, align 8
+ %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 %i
+ %tmp2 = load i64, i64* %arrayidx3, align 8
+ %tmp3 = load i64, i64* @total, align 8
+ %add = add i64 %tmp3, %tmp2
+ store i64 %add, i64* @total, align 8
+ %tmp4 = load i64, i64* %a, align 8
+ %shr5 = lshr i64 %tmp4, 4
+ store i64 %shr5, i64* %a, align 8
+ %tmp5 = load i64, i64* %arrayidx1, align 8
+ %shr7 = lshr i64 %tmp5, 4
+ store i64 %shr7, i64* %arrayidx1, align 8
+ %tmp6 = load i64, i64* %arrayidx3, align 8
+ %tmp7 = load i64, i64* @total, align 8
+ %add9 = add i64 %tmp7, %tmp6
+ store i64 %add9, i64* @total, align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll
new file mode 100644
index 00000000000..b1c1d9540d0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr27163.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S < %s | FileCheck %s
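+; PR27163: vectorizing the paired i64 load/store in a function with WinEH
+; funclets must keep %load1 available to the phi in the ehcleanup funclet; the
+; expected output therefore extracts lane 0 of the vector load before the
+; invoke.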
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc18.0.0"
+
+%struct.B = type { i64, i64 }
+
+define void @test1(%struct.B* %p) personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: invoke.cont:
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], %struct.B* [[P:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [[STRUCT_B]], %struct.B* [[P]], i64 0, i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[GEP1]] to <2 x i64>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[GEP1]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT: invoke void @throw()
+; CHECK-NEXT: to label [[UNREACHABLE:%.*]] unwind label [[CATCH_DISPATCH:%.*]]
+; CHECK: catch.dispatch:
+; CHECK-NEXT: [[CS:%.*]] = catchswitch within none [label %invoke.cont1] unwind label [[EHCLEANUP:%.*]]
+; CHECK: invoke.cont1:
+; CHECK-NEXT: [[CATCH:%.*]] = catchpad within [[CS]] [i8* null, i32 64, i8* null]
+; CHECK-NEXT: invoke void @throw() [ "funclet"(token [[CATCH]]) ]
+; CHECK-NEXT: to label [[UNREACHABLE]] unwind label [[EHCLEANUP]]
+; CHECK: ehcleanup:
+; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[TMP2]], [[CATCH_DISPATCH]] ], [ 9, [[INVOKE_CONT1:%.*]] ]
+; CHECK-NEXT: [[CLEANUP:%.*]] = cleanuppad within none []
+; CHECK-NEXT: call void @release(i64 [[PHI]]) [ "funclet"(token [[CLEANUP]]) ]
+; CHECK-NEXT: cleanupret from [[CLEANUP]] unwind to caller
+; CHECK: unreachable:
+; CHECK-NEXT: unreachable
+;
+invoke.cont:
+ %gep1 = getelementptr inbounds %struct.B, %struct.B* %p, i64 0, i32 0
+ %gep2 = getelementptr inbounds %struct.B, %struct.B* %p, i64 0, i32 1
+ %load1 = load i64, i64* %gep1, align 8
+ %load2 = load i64, i64* %gep2, align 8
+ store i64 %load1, i64* %gep1, align 8
+ store i64 %load2, i64* %gep2, align 8
+ invoke void @throw()
+ to label %unreachable unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %invoke.cont
+ %cs = catchswitch within none [label %invoke.cont1] unwind label %ehcleanup
+
+invoke.cont1: ; preds = %catch.dispatch
+ %catch = catchpad within %cs [i8* null, i32 64, i8* null]
+ invoke void @throw() [ "funclet"(token %catch) ]
+ to label %unreachable unwind label %ehcleanup
+
+ehcleanup: ; preds = %invoke.cont1, %catch.dispatch
+ %phi = phi i64 [ %load1, %catch.dispatch ], [ 9, %invoke.cont1 ]
+ %cleanup = cleanuppad within none []
+ call void @release(i64 %phi) [ "funclet"(token %cleanup) ]
+ cleanupret from %cleanup unwind to caller
+
+unreachable: ; preds = %invoke.cont1, %invoke.cont
+ unreachable
+}
+
+declare i32 @__CxxFrameHandler3(...)
+
+declare void @throw()
+
+declare void @release(i64)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll
new file mode 100644
index 00000000000..64e0f7be7e2
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
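+; The redundant extracts of the same <2 x float> lanes feeding the two scalar
+; fsubs should fold into a single vector fsub of [[SOURCE]] with itself, with
+; the results re-inserted lane by lane, as checked below.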
+
+define <2 x float> @foo() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SOURCE:%.*]] = insertelement <2 x float> undef, float undef, i32 0
+; CHECK-NEXT: [[TMP0:%.*]] = fsub <2 x float> [[SOURCE]], [[SOURCE]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-NEXT: [[RES1:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; CHECK-NEXT: [[RES2:%.*]] = insertelement <2 x float> [[RES1]], float [[TMP2]], i32 1
+; CHECK-NEXT: ret <2 x float> [[RES2]]
+;
+entry:
+ %source = insertelement <2 x float> undef, float undef, i32 0
+ %e0 = extractelement <2 x float> %source, i32 0
+ %e0.dup = extractelement <2 x float> %source, i32 0
+ %sub1 = fsub float %e0, %e0.dup
+ %e1 = extractelement <2 x float> %source, i32 1
+ %e1.dup = extractelement <2 x float> %source, i32 1
+ %sub2 = fsub float %e1, %e1.dup
+ %res1 = insertelement <2 x float> undef, float %sub1, i32 0
+ %res2 = insertelement <2 x float> %res1, float %sub2, i32 1
+ ret <2 x float> %res2
+}
+
+!llvm.ident = !{!0, !0}
+
+!0 = !{!"clang version 4.0.0 "}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
new file mode 100644
index 00000000000..bdb37b28d58
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+%class.1 = type { %class.2 }
+%class.2 = type { %"class.3" }
+%"class.3" = type { %"struct.1", i64 }
+%"struct.1" = type { [8 x i64] }
+
+$_ZN1C10SwitchModeEv = comdat any
+
+; Function Attrs: uwtable
+define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 {
+; CHECK-LABEL: @_ZN1C10SwitchModeEv(
+; CHECK-NEXT: for.body.lr.ph.i:
+; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1
+; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8
+; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
+; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT: ret void
+;
+for.body.lr.ph.i:
+ %or.1 = or i64 undef, 1
+ store i64 %or.1, i64* undef, align 8
+ %foo.1 = getelementptr inbounds %class.1, %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
+ %foo.3 = load i64, i64* %foo.1, align 8
+ %foo.2 = getelementptr inbounds %class.1, %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
+ %foo.4 = load i64, i64* %foo.2, align 8
+ %bar5 = load i64, i64* undef, align 8
+ %and.2 = and i64 %or.1, %foo.3
+ %and.1 = and i64 %bar5, %foo.4
+ %bar3 = getelementptr inbounds %class.2, %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
+ store i64 %and.2, i64* %bar3, align 8
+ %bar4 = getelementptr inbounds %class.2, %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
+ store i64 %and.1, i64* %bar4, align 8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind uwtable
+define void @pr35497() local_unnamed_addr #0 {
+; CHECK-LABEL: @pr35497(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* undef, align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i64 undef, undef
+; CHECK-NEXT: store i64 [[ADD]], i64* undef, align 1
+; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
+; CHECK-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
+; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
+; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
+; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
+; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i64, i64* undef, align 1
+ %and = shl i64 %0, 2
+ %shl = and i64 %and, 20
+ %add = add i64 undef, undef
+ store i64 %add, i64* undef, align 1
+ %arrayidx2.1 = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+ %and.1 = shl i64 undef, 2
+ %shl.1 = and i64 %and.1, 20
+ %shr.1 = lshr i64 undef, 6
+ %add.1 = add nuw nsw i64 %shl, %shr.1
+ %arrayidx2.2 = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+ %shr.2 = lshr i64 undef, 6
+ %add.2 = add nuw nsw i64 %shl.1, %shr.2
+ %and.4 = shl i64 %add, 2
+ %shl.4 = and i64 %and.4, 20
+ %arrayidx2.5 = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+ store i64 %add.1, i64* %arrayidx2.5, align 1
+ %and.5 = shl nuw nsw i64 %add.1, 2
+ %shl.5 = and i64 %and.5, 20
+ %shr.5 = lshr i64 %add.1, 6
+ %add.5 = add nuw nsw i64 %shl.4, %shr.5
+ store i64 %add.5, i64* %arrayidx2.1, align 1
+ %arrayidx2.6 = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
+ store i64 %add.2, i64* %arrayidx2.6, align 1
+ %shr.6 = lshr i64 %add.2, 6
+ %add.6 = add nuw nsw i64 %shl.5, %shr.6
+ store i64 %add.6, i64* %arrayidx2.2, align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll b/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
new file mode 100644
index 00000000000..380f58fe5dc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
@@ -0,0 +1,607 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+
+; Check propagation of optional IR flags (PR20802). For a flag to
+; propagate from scalar instructions to their vector replacement,
+; *all* scalar instructions must have the flag.
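+;
+; A minimal sketch of the rule (illustrative only; %a and %b are hypothetical
+; operands, mirroring the @not_nsw test below):
+;   %op1 = add nsw i32 %a, 1   ; has nsw
+;   %op2 = add i32 %b, 1       ; lacks nsw
+; can only be replaced by a plain 'add <2 x i32>' with the nsw flag dropped.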
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+define void @exact(i32* %x) {
+; CHECK-LABEL: @exact(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = lshr exact <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = lshr exact i32 %load1, 1
+ %op2 = lshr exact i32 %load2, 1
+ %op3 = lshr exact i32 %load3, 1
+ %op4 = lshr exact i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @not_exact(i32* %x) {
+; CHECK-LABEL: @not_exact(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = lshr exact i32 %load1, 1
+ %op2 = lshr i32 %load2, 1
+ %op3 = lshr exact i32 %load3, 1
+ %op4 = lshr exact i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @nsw(i32* %x) {
+; CHECK-LABEL: @nsw(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = add nsw i32 %load1, 1
+ %op2 = add nsw i32 %load2, 1
+ %op3 = add nsw i32 %load3, 1
+ %op4 = add nsw i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @not_nsw(i32* %x) {
+; CHECK-LABEL: @not_nsw(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = add nsw i32 %load1, 1
+ %op2 = add nsw i32 %load2, 1
+ %op3 = add nsw i32 %load3, 1
+ %op4 = add i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @nuw(i32* %x) {
+; CHECK-LABEL: @nuw(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = add nuw i32 %load1, 1
+ %op2 = add nuw i32 %load2, 1
+ %op3 = add nuw i32 %load3, 1
+ %op4 = add nuw i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @not_nuw(i32* %x) {
+; CHECK-LABEL: @not_nuw(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = add nuw i32 %load1, 1
+ %op2 = add i32 %load2, 1
+ %op3 = add i32 %load3, 1
+ %op4 = add nuw i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @nnan(float* %x) {
+; CHECK-LABEL: @nnan(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd nnan <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds float, float* %x, i64 0
+ %idx2 = getelementptr inbounds float, float* %x, i64 1
+ %idx3 = getelementptr inbounds float, float* %x, i64 2
+ %idx4 = getelementptr inbounds float, float* %x, i64 3
+
+ %load1 = load float, float* %idx1, align 4
+ %load2 = load float, float* %idx2, align 4
+ %load3 = load float, float* %idx3, align 4
+ %load4 = load float, float* %idx4, align 4
+
+ %op1 = fadd fast nnan float %load1, 1.0
+ %op2 = fadd nnan ninf float %load2, 1.0
+ %op3 = fadd nsz nnan float %load3, 1.0
+ %op4 = fadd arcp nnan float %load4, 1.0
+
+ store float %op1, float* %idx1, align 4
+ store float %op2, float* %idx2, align 4
+ store float %op3, float* %idx3, align 4
+ store float %op4, float* %idx4, align 4
+
+ ret void
+}
+
+define void @not_nnan(float* %x) {
+; CHECK-LABEL: @not_nnan(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds float, float* %x, i64 0
+ %idx2 = getelementptr inbounds float, float* %x, i64 1
+ %idx3 = getelementptr inbounds float, float* %x, i64 2
+ %idx4 = getelementptr inbounds float, float* %x, i64 3
+
+ %load1 = load float, float* %idx1, align 4
+ %load2 = load float, float* %idx2, align 4
+ %load3 = load float, float* %idx3, align 4
+ %load4 = load float, float* %idx4, align 4
+
+ %op1 = fadd nnan float %load1, 1.0
+ %op2 = fadd ninf float %load2, 1.0
+ %op3 = fadd nsz float %load3, 1.0
+ %op4 = fadd arcp float %load4, 1.0
+
+ store float %op1, float* %idx1, align 4
+ store float %op2, float* %idx2, align 4
+ store float %op3, float* %idx3, align 4
+ store float %op4, float* %idx4, align 4
+
+ ret void
+}
+
+define void @only_fast(float* %x) {
+; CHECK-LABEL: @only_fast(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds float, float* %x, i64 0
+ %idx2 = getelementptr inbounds float, float* %x, i64 1
+ %idx3 = getelementptr inbounds float, float* %x, i64 2
+ %idx4 = getelementptr inbounds float, float* %x, i64 3
+
+ %load1 = load float, float* %idx1, align 4
+ %load2 = load float, float* %idx2, align 4
+ %load3 = load float, float* %idx3, align 4
+ %load4 = load float, float* %idx4, align 4
+
+ %op1 = fadd fast nnan float %load1, 1.0
+ %op2 = fadd fast nnan ninf float %load2, 1.0
+ %op3 = fadd fast nsz nnan float %load3, 1.0
+ %op4 = fadd arcp nnan fast float %load4, 1.0
+
+ store float %op1, float* %idx1, align 4
+ store float %op2, float* %idx2, align 4
+ store float %op3, float* %idx3, align 4
+ store float %op4, float* %idx4, align 4
+
+ ret void
+}
+
+define void @only_arcp(float* %x) {
+; CHECK-LABEL: @only_arcp(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd arcp <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds float, float* %x, i64 0
+ %idx2 = getelementptr inbounds float, float* %x, i64 1
+ %idx3 = getelementptr inbounds float, float* %x, i64 2
+ %idx4 = getelementptr inbounds float, float* %x, i64 3
+
+ %load1 = load float, float* %idx1, align 4
+ %load2 = load float, float* %idx2, align 4
+ %load3 = load float, float* %idx3, align 4
+ %load4 = load float, float* %idx4, align 4
+
+ %op1 = fadd fast float %load1, 1.0
+ %op2 = fadd fast float %load2, 1.0
+ %op3 = fadd fast float %load3, 1.0
+ %op4 = fadd arcp float %load4, 1.0
+
+ store float %op1, float* %idx1, align 4
+ store float %op2, float* %idx2, align 4
+ store float %op3, float* %idx3, align 4
+ store float %op4, float* %idx4, align 4
+
+ ret void
+}
+
+define void @addsub_all_nsw(i32* %x) {
+; CHECK-LABEL: @addsub_all_nsw(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = add nsw i32 %load1, 1
+ %op2 = sub nsw i32 %load2, 1
+ %op3 = add nsw i32 %load3, 1
+ %op4 = sub nsw i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @addsub_some_nsw(i32* %x) {
+; CHECK-LABEL: @addsub_some_nsw(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = add nsw i32 %load1, 1
+ %op2 = sub nsw i32 %load2, 1
+ %op3 = add nsw i32 %load3, 1
+ %op4 = sub i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @addsub_no_nsw(i32* %x) {
+; CHECK-LABEL: @addsub_no_nsw(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds i32, i32* %x, i64 0
+ %idx2 = getelementptr inbounds i32, i32* %x, i64 1
+ %idx3 = getelementptr inbounds i32, i32* %x, i64 2
+ %idx4 = getelementptr inbounds i32, i32* %x, i64 3
+
+ %load1 = load i32, i32* %idx1, align 4
+ %load2 = load i32, i32* %idx2, align 4
+ %load3 = load i32, i32* %idx3, align 4
+ %load4 = load i32, i32* %idx4, align 4
+
+ %op1 = add i32 %load1, 1
+ %op2 = sub nsw i32 %load2, 1
+ %op3 = add nsw i32 %load3, 1
+ %op4 = sub i32 %load4, 1
+
+ store i32 %op1, i32* %idx1, align 4
+ store i32 %op2, i32* %idx2, align 4
+ store i32 %op3, i32* %idx3, align 4
+ store i32 %op4, i32* %idx4, align 4
+
+ ret void
+}
+
+define void @fcmp_fast(double* %x) #1 {
+; CHECK-LABEL: @fcmp_fast(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast oge <2 x double> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP2]], <2 x double> [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds double, double* %x, i64 0
+ %idx2 = getelementptr inbounds double, double* %x, i64 1
+
+ %load1 = load double, double* %idx1, align 8
+ %load2 = load double, double* %idx2, align 8
+
+ %cmp1 = fcmp fast oge double %load1, 0.000000e+00
+ %cmp2 = fcmp fast oge double %load2, 0.000000e+00
+
+ %sub1 = fsub fast double -0.000000e+00, %load1
+ %sub2 = fsub fast double -0.000000e+00, %load2
+
+ %sel1 = select i1 %cmp1, double %load1, double %sub1
+ %sel2 = select i1 %cmp2, double %load2, double %sub2
+
+ store double %sel1, double* %idx1, align 8
+ store double %sel2, double* %idx2, align 8
+
+ ret void
+}
+
+define void @fcmp_no_fast(double* %x) #1 {
+; CHECK-LABEL: @fcmp_no_fast(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <2 x double> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP2]], <2 x double> [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds double, double* %x, i64 0
+ %idx2 = getelementptr inbounds double, double* %x, i64 1
+
+ %load1 = load double, double* %idx1, align 8
+ %load2 = load double, double* %idx2, align 8
+
+ %cmp1 = fcmp fast oge double %load1, 0.000000e+00
+ %cmp2 = fcmp oge double %load2, 0.000000e+00
+
+ %sub1 = fsub fast double -0.000000e+00, %load1
+ %sub2 = fsub double -0.000000e+00, %load2
+
+ %sel1 = select i1 %cmp1, double %load1, double %sub1
+ %sel2 = select i1 %cmp2, double %load2, double %sub2
+
+ store double %sel1, double* %idx1, align 8
+ store double %sel2, double* %idx2, align 8
+
+ ret void
+}
+
+declare double @llvm.fabs.f64(double) nounwind readnone
+
+define void @call_fast(double* %x) {
+; CHECK-LABEL: @call_fast(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds double, double* %x, i64 0
+ %idx2 = getelementptr inbounds double, double* %x, i64 1
+
+ %load1 = load double, double* %idx1, align 8
+ %load2 = load double, double* %idx2, align 8
+
+ %call1 = tail call fast double @llvm.fabs.f64(double %load1) nounwind readnone
+ %call2 = tail call fast double @llvm.fabs.f64(double %load2) nounwind readnone
+
+ store double %call1, double* %idx1, align 8
+ store double %call2, double* %idx2, align 8
+
+ ret void
+}
+
+define void @call_no_fast(double* %x) {
+; CHECK-LABEL: @call_no_fast(
+; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 0
+; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDX1]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+ %idx1 = getelementptr inbounds double, double* %x, i64 0
+ %idx2 = getelementptr inbounds double, double* %x, i64 1
+
+ %load1 = load double, double* %idx1, align 8
+ %load2 = load double, double* %idx2, align 8
+
+ %call1 = tail call fast double @llvm.fabs.f64(double %load1) nounwind readnone
+ %call2 = tail call double @llvm.fabs.f64(double %load2) nounwind readnone
+
+ store double %call1, double* %idx1, align 8
+ store double %call2, double* %idx2, align 8
+
+ ret void
+}
+
+attributes #1 = { "target-features"="+avx" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
new file mode 100644
index 00000000000..13bbe5e5798
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -reassociate -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
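+; The 32-element extract/add chain below should be recognized as a horizontal
+; add reduction: the expected output replaces it with a log2(32) = 5-step
+; shuffle-and-add tree over the <32 x i8> load, leaving the now-dead scalar
+; adds over undef behind for later DCE.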
+
+define signext i8 @Foo(<32 x i8>* %__v) {
+; CHECK-LABEL: @Foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <32 x i8>, <32 x i8>* [[__V:%.*]], align 32
+; CHECK-NEXT: [[ADD_I_1_I:%.*]] = add i8 undef, undef
+; CHECK-NEXT: [[ADD_I_2_I:%.*]] = add i8 [[ADD_I_1_I]], undef
+; CHECK-NEXT: [[ADD_I_3_I:%.*]] = add i8 [[ADD_I_2_I]], undef
+; CHECK-NEXT: [[ADD_I_4_I:%.*]] = add i8 [[ADD_I_3_I]], undef
+; CHECK-NEXT: [[ADD_I_5_I:%.*]] = add i8 [[ADD_I_4_I]], undef
+; CHECK-NEXT: [[ADD_I_6_I:%.*]] = add i8 [[ADD_I_5_I]], undef
+; CHECK-NEXT: [[ADD_I_7_I:%.*]] = add i8 [[ADD_I_6_I]], undef
+; CHECK-NEXT: [[ADD_I_8_I:%.*]] = add i8 [[ADD_I_7_I]], undef
+; CHECK-NEXT: [[ADD_I_9_I:%.*]] = add i8 [[ADD_I_8_I]], undef
+; CHECK-NEXT: [[ADD_I_10_I:%.*]] = add i8 [[ADD_I_9_I]], undef
+; CHECK-NEXT: [[ADD_I_11_I:%.*]] = add i8 [[ADD_I_10_I]], undef
+; CHECK-NEXT: [[ADD_I_12_I:%.*]] = add i8 [[ADD_I_11_I]], undef
+; CHECK-NEXT: [[ADD_I_13_I:%.*]] = add i8 [[ADD_I_12_I]], undef
+; CHECK-NEXT: [[ADD_I_14_I:%.*]] = add i8 [[ADD_I_13_I]], undef
+; CHECK-NEXT: [[ADD_I_15_I:%.*]] = add i8 [[ADD_I_14_I]], undef
+; CHECK-NEXT: [[ADD_I_16_I:%.*]] = add i8 [[ADD_I_15_I]], undef
+; CHECK-NEXT: [[ADD_I_17_I:%.*]] = add i8 [[ADD_I_16_I]], undef
+; CHECK-NEXT: [[ADD_I_18_I:%.*]] = add i8 [[ADD_I_17_I]], undef
+; CHECK-NEXT: [[ADD_I_19_I:%.*]] = add i8 [[ADD_I_18_I]], undef
+; CHECK-NEXT: [[ADD_I_20_I:%.*]] = add i8 [[ADD_I_19_I]], undef
+; CHECK-NEXT: [[ADD_I_21_I:%.*]] = add i8 [[ADD_I_20_I]], undef
+; CHECK-NEXT: [[ADD_I_22_I:%.*]] = add i8 [[ADD_I_21_I]], undef
+; CHECK-NEXT: [[ADD_I_23_I:%.*]] = add i8 [[ADD_I_22_I]], undef
+; CHECK-NEXT: [[ADD_I_24_I:%.*]] = add i8 [[ADD_I_23_I]], undef
+; CHECK-NEXT: [[ADD_I_25_I:%.*]] = add i8 [[ADD_I_24_I]], undef
+; CHECK-NEXT: [[ADD_I_26_I:%.*]] = add i8 [[ADD_I_25_I]], undef
+; CHECK-NEXT: [[ADD_I_27_I:%.*]] = add i8 [[ADD_I_26_I]], undef
+; CHECK-NEXT: [[ADD_I_28_I:%.*]] = add i8 [[ADD_I_27_I]], undef
+; CHECK-NEXT: [[ADD_I_29_I:%.*]] = add i8 [[ADD_I_28_I]], undef
+; CHECK-NEXT: [[ADD_I_30_I:%.*]] = add i8 [[ADD_I_29_I]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <32 x i8> [[TMP0]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x i8> [[BIN_RDX]], <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <32 x i8> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x i8> [[BIN_RDX2]], <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <32 x i8> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x i8> [[BIN_RDX4]], <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <32 x i8> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x i8> [[BIN_RDX6]], <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <32 x i8> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <32 x i8> [[BIN_RDX8]], i32 0
+; CHECK-NEXT: [[ADD_I_31_I:%.*]] = add i8 [[ADD_I_30_I]], undef
+; CHECK-NEXT: ret i8 [[TMP1]]
+;
+entry:
+ %0 = load <32 x i8>, <32 x i8>* %__v, align 32
+ %vecext.i.i.i = extractelement <32 x i8> %0, i64 0
+ %vecext.i.i.1.i = extractelement <32 x i8> %0, i64 1
+ %add.i.1.i = add i8 %vecext.i.i.1.i, %vecext.i.i.i
+ %vecext.i.i.2.i = extractelement <32 x i8> %0, i64 2
+ %add.i.2.i = add i8 %vecext.i.i.2.i, %add.i.1.i
+ %vecext.i.i.3.i = extractelement <32 x i8> %0, i64 3
+ %add.i.3.i = add i8 %vecext.i.i.3.i, %add.i.2.i
+ %vecext.i.i.4.i = extractelement <32 x i8> %0, i64 4
+ %add.i.4.i = add i8 %vecext.i.i.4.i, %add.i.3.i
+ %vecext.i.i.5.i = extractelement <32 x i8> %0, i64 5
+ %add.i.5.i = add i8 %vecext.i.i.5.i, %add.i.4.i
+ %vecext.i.i.6.i = extractelement <32 x i8> %0, i64 6
+ %add.i.6.i = add i8 %vecext.i.i.6.i, %add.i.5.i
+ %vecext.i.i.7.i = extractelement <32 x i8> %0, i64 7
+ %add.i.7.i = add i8 %vecext.i.i.7.i, %add.i.6.i
+ %vecext.i.i.8.i = extractelement <32 x i8> %0, i64 8
+ %add.i.8.i = add i8 %vecext.i.i.8.i, %add.i.7.i
+ %vecext.i.i.9.i = extractelement <32 x i8> %0, i64 9
+ %add.i.9.i = add i8 %vecext.i.i.9.i, %add.i.8.i
+ %vecext.i.i.10.i = extractelement <32 x i8> %0, i64 10
+ %add.i.10.i = add i8 %vecext.i.i.10.i, %add.i.9.i
+ %vecext.i.i.11.i = extractelement <32 x i8> %0, i64 11
+ %add.i.11.i = add i8 %vecext.i.i.11.i, %add.i.10.i
+ %vecext.i.i.12.i = extractelement <32 x i8> %0, i64 12
+ %add.i.12.i = add i8 %vecext.i.i.12.i, %add.i.11.i
+ %vecext.i.i.13.i = extractelement <32 x i8> %0, i64 13
+ %add.i.13.i = add i8 %vecext.i.i.13.i, %add.i.12.i
+ %vecext.i.i.14.i = extractelement <32 x i8> %0, i64 14
+ %add.i.14.i = add i8 %vecext.i.i.14.i, %add.i.13.i
+ %vecext.i.i.15.i = extractelement <32 x i8> %0, i64 15
+ %add.i.15.i = add i8 %vecext.i.i.15.i, %add.i.14.i
+ %vecext.i.i.16.i = extractelement <32 x i8> %0, i64 16
+ %add.i.16.i = add i8 %vecext.i.i.16.i, %add.i.15.i
+ %vecext.i.i.17.i = extractelement <32 x i8> %0, i64 17
+ %add.i.17.i = add i8 %vecext.i.i.17.i, %add.i.16.i
+ %vecext.i.i.18.i = extractelement <32 x i8> %0, i64 18
+ %add.i.18.i = add i8 %vecext.i.i.18.i, %add.i.17.i
+ %vecext.i.i.19.i = extractelement <32 x i8> %0, i64 19
+ %add.i.19.i = add i8 %vecext.i.i.19.i, %add.i.18.i
+ %vecext.i.i.20.i = extractelement <32 x i8> %0, i64 20
+ %add.i.20.i = add i8 %vecext.i.i.20.i, %add.i.19.i
+ %vecext.i.i.21.i = extractelement <32 x i8> %0, i64 21
+ %add.i.21.i = add i8 %vecext.i.i.21.i, %add.i.20.i
+ %vecext.i.i.22.i = extractelement <32 x i8> %0, i64 22
+ %add.i.22.i = add i8 %vecext.i.i.22.i, %add.i.21.i
+ %vecext.i.i.23.i = extractelement <32 x i8> %0, i64 23
+ %add.i.23.i = add i8 %vecext.i.i.23.i, %add.i.22.i
+ %vecext.i.i.24.i = extractelement <32 x i8> %0, i64 24
+ %add.i.24.i = add i8 %vecext.i.i.24.i, %add.i.23.i
+ %vecext.i.i.25.i = extractelement <32 x i8> %0, i64 25
+ %add.i.25.i = add i8 %vecext.i.i.25.i, %add.i.24.i
+ %vecext.i.i.26.i = extractelement <32 x i8> %0, i64 26
+ %add.i.26.i = add i8 %vecext.i.i.26.i, %add.i.25.i
+ %vecext.i.i.27.i = extractelement <32 x i8> %0, i64 27
+ %add.i.27.i = add i8 %vecext.i.i.27.i, %add.i.26.i
+ %vecext.i.i.28.i = extractelement <32 x i8> %0, i64 28
+ %add.i.28.i = add i8 %vecext.i.i.28.i, %add.i.27.i
+ %vecext.i.i.29.i = extractelement <32 x i8> %0, i64 29
+ %add.i.29.i = add i8 %vecext.i.i.29.i, %add.i.28.i
+ %vecext.i.i.30.i = extractelement <32 x i8> %0, i64 30
+ %add.i.30.i = add i8 %vecext.i.i.30.i, %add.i.29.i
+ %vecext.i.i.31.i = extractelement <32 x i8> %0, i64 31
+ %add.i.31.i = add i8 %vecext.i.i.31.i, %add.i.30.i
+ ret i8 %add.i.31.i
+}
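+; The 32-element i8 add chain above is matched as a horizontal reduction and
+; lowered to log2(32) = 5 shufflevector+add stages plus a final
+; extractelement; the remaining scalar `add i8 ..., undef` chain is dead and
+; is simply left behind for later cleanup.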
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll
new file mode 100644
index 00000000000..e9f8e7f7c88
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+; int foo(double *A, int n, int m) {
+; double sum = 0;
+; for (int i=0; i < n; ++i)
+; sum += 7*A[i*2] + 7*A[i*2+1];
+; return sum;
+; }
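+;
+; The loads of A[i*2] and A[i*2+1] are consecutive, so the two fmuls by 7.0
+; become a single <2 x double> fmul; the loop-carried accumulation into sum
+; stays scalar, as the CHECK lines show.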
+
+define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) {
+; CHECK-LABEL: @reduce(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP13]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_015:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[SUM_014:%.*]] = phi double [ [[ADD6:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i32 [[MUL]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 7.000000e+00, double 7.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[ADD6]] = fadd double [[SUM_014]], [[ADD5]]
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_015]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.for.end_crit_edge:
+; CHECK-NEXT: [[PHITMP:%.*]] = fptosi double [[ADD6]] to i32
+; CHECK-NEXT: br label [[FOR_END]]
+; CHECK: for.end:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+entry:
+ %cmp13 = icmp sgt i32 %n, 0
+ br i1 %cmp13, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.015 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %sum.014 = phi double [ %add6, %for.body ], [ 0.000000e+00, %entry ]
+ %mul = shl nsw i32 %i.015, 1
+ %arrayidx = getelementptr inbounds double, double* %A, i32 %mul
+ %0 = load double, double* %arrayidx, align 4
+ %mul1 = fmul double %0, 7.000000e+00
+ %add12 = or i32 %mul, 1
+ %arrayidx3 = getelementptr inbounds double, double* %A, i32 %add12
+ %1 = load double, double* %arrayidx3, align 4
+ %mul4 = fmul double %1, 7.000000e+00
+ %add5 = fadd double %mul1, %mul4
+ %add6 = fadd double %sum.014, %add5
+ %inc = add nsw i32 %i.015, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge: ; preds = %for.body
+ %phitmp = fptosi double %add6 to i32
+ br label %for.end
+
+for.end: ; preds = %for.cond.for.end_crit_edge, %entry
+ %sum.0.lcssa = phi i32 [ %phitmp, %for.cond.for.end_crit_edge ], [ 0, %entry ]
+ ret i32 %sum.0.lcssa
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
new file mode 100644
index 00000000000..87a6af792f5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
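+; Each iteration loads the adjacent pair D[2*i] and D[2*i+1], squares it
+; twice, and adds both lanes into a scalar accumulator; the repeated fmuls
+; are vectorized to <2 x double> while the horizontal fadd and the phi stay
+; scalar.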
+define double @foo(double* nocapture %D) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: br label [[TMP1:%.*]]
+; CHECK: [[I_02:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP12:%.*]], [[TMP1]] ]
+; CHECK-NEXT: [[SUM_01:%.*]] = phi double [ 0.000000e+00, [[TMP0]] ], [ [[TMP11:%.*]], [[TMP1]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i32 [[I_02]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i32 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = fadd double [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11]] = fadd double [[SUM_01]], [[TMP10]]
+; CHECK-NEXT: [[TMP12]] = add nsw i32 [[I_02]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP12]], 100
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[TMP13:%.*]], label [[TMP1]]
+; CHECK: ret double [[TMP11]]
+;
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %i.02 = phi i32 [ 0, %0 ], [ %10, %1 ]
+ %sum.01 = phi double [ 0.000000e+00, %0 ], [ %9, %1 ]
+ %2 = shl nsw i32 %i.02, 1
+ %3 = getelementptr inbounds double, double* %D, i32 %2
+ %4 = load double, double* %3, align 4
+ %A4 = fmul double %4, %4
+ %A42 = fmul double %A4, %A4
+ %5 = or i32 %2, 1
+ %6 = getelementptr inbounds double, double* %D, i32 %5
+ %7 = load double, double* %6, align 4
+ %A7 = fmul double %7, %7
+ %A72 = fmul double %A7, %A7
+ %8 = fadd double %A42, %A72
+ %9 = fadd double %sum.01, %8
+ %10 = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %10, 100
+ br i1 %exitcond, label %11, label %1
+
+; <label>:11 ; preds = %1
+ ret double %9
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
new file mode 100644
index 00000000000..0f0bbf9a2ad
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.10.0 -mattr=+sse4.2 | FileCheck %s
+
+
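+; Sums 42 * p[i] for i = 0..7 into a loop-carried accumulator. The eight
+; loads and muls become a single <8 x i32> load and mul, the sum collapses
+; into a three-step shuffle reduction, and the phi is added back afterwards
+; as the extra scalar operand ([[OP_EXTRA]] in the CHECK lines).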
+define i32 @test(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
+; CHECK-NEXT: [[ADD:%.*]] = add i32 undef, [[SUM]]
+; CHECK-NEXT: [[ADD_1:%.*]] = add i32 undef, [[ADD]]
+; CHECK-NEXT: [[ADD_2:%.*]] = add i32 undef, [[ADD_1]]
+; CHECK-NEXT: [[ADD_3:%.*]] = add i32 undef, [[ADD_2]]
+; CHECK-NEXT: [[ADD_4:%.*]] = add i32 undef, [[ADD_3]]
+; CHECK-NEXT: [[ADD_5:%.*]] = add i32 undef, [[ADD_4]]
+; CHECK-NEXT: [[ADD_6:%.*]] = add i32 undef, [[ADD_5]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT: [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 [[OP_EXTRA]]
+;
+entry:
+ %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+ %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+ %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+ %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+ %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+ %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+ %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+ br label %for.body
+
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
+ %tmp = load i32, i32* %p, align 4
+ %mul = mul i32 %tmp, 42
+ %add = add i32 %mul, %sum
+ %tmp5 = load i32, i32* %arrayidx.1, align 4
+ %mul.1 = mul i32 %tmp5, 42
+ %add.1 = add i32 %mul.1, %add
+ %tmp6 = load i32, i32* %arrayidx.2, align 4
+ %mul.2 = mul i32 %tmp6, 42
+ %add.2 = add i32 %mul.2, %add.1
+ %tmp7 = load i32, i32* %arrayidx.3, align 4
+ %mul.3 = mul i32 %tmp7, 42
+ %add.3 = add i32 %mul.3, %add.2
+ %tmp8 = load i32, i32* %arrayidx.4, align 4
+ %mul.4 = mul i32 %tmp8, 42
+ %add.4 = add i32 %mul.4, %add.3
+ %tmp9 = load i32, i32* %arrayidx.5, align 4
+ %mul.5 = mul i32 %tmp9, 42
+ %add.5 = add i32 %mul.5, %add.4
+ %tmp10 = load i32, i32* %arrayidx.6, align 4
+ %mul.6 = mul i32 %tmp10, 42
+ %add.6 = add i32 %mul.6, %add.5
+ %tmp11 = load i32, i32* %arrayidx.7, align 4
+ %mul.7 = mul i32 %tmp11, 42
+ %add.7 = add i32 %mul.7, %add.6
+ br i1 true, label %for.end, label %for.body
+
+for.end:
+ ret i32 %add.7
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
new file mode 100644
index 00000000000..4dd40876a70
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll
@@ -0,0 +1,352 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -debug < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -S -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -debug < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,SSE
+; REQUIRES: asserts
+
+; int test_add(unsigned int *p) {
+; int result = 0;
+; for (int i = 0; i < 8; i++)
+; result += p[i];
+; return result;
+; }
+
+; Vector cost is 5, Scalar cost is 7
+; AVX: Adding cost -2 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
+; Vector cost is 6, Scalar cost is 7
+; SSE: Adding cost -1 for reduction that starts with %7 = load i32, i32* %arrayidx.7, align 4 (It is a splitting reduction)
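+; A splitting reduction halves the vector on every step, so the 8-wide add
+; takes log2(8) = 3 shufflevector+add stages before the final extractelement;
+; the negative "Adding cost" above is the credit the cost model gives that
+; reduction on each subtarget.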
+define i32 @test_add(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[MUL_18:%.*]] = add i32 undef, undef
+; CHECK-NEXT: [[MUL_29:%.*]] = add i32 undef, [[MUL_18]]
+; CHECK-NEXT: [[MUL_310:%.*]] = add i32 undef, [[MUL_29]]
+; CHECK-NEXT: [[MUL_411:%.*]] = add i32 undef, [[MUL_310]]
+; CHECK-NEXT: [[MUL_512:%.*]] = add i32 undef, [[MUL_411]]
+; CHECK-NEXT: [[MUL_613:%.*]] = add i32 undef, [[MUL_512]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[MUL_714:%.*]] = add i32 undef, [[MUL_613]]
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+entry:
+ %0 = load i32, i32* %p, align 4
+ %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+ %1 = load i32, i32* %arrayidx.1, align 4
+ %mul.18 = add i32 %1, %0
+ %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+ %2 = load i32, i32* %arrayidx.2, align 4
+ %mul.29 = add i32 %2, %mul.18
+ %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+ %3 = load i32, i32* %arrayidx.3, align 4
+ %mul.310 = add i32 %3, %mul.29
+ %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+ %4 = load i32, i32* %arrayidx.4, align 4
+ %mul.411 = add i32 %4, %mul.310
+ %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+ %5 = load i32, i32* %arrayidx.5, align 4
+ %mul.512 = add i32 %5, %mul.411
+ %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+ %6 = load i32, i32* %arrayidx.6, align 4
+ %mul.613 = add i32 %6, %mul.512
+ %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+ %7 = load i32, i32* %arrayidx.7, align 4
+ %mul.714 = add i32 %7, %mul.613
+ ret i32 %mul.714
+}
+
+; int test_mul(unsigned int *p) {
+; int result = 0;
+; for (int i = 0; i < 8; i++)
+; result *= p[i];
+; return result;
+; }
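+;
+; Unlike test_add, this case is left fully scalar (the CHECK lines show eight
+; plain i32 muls), presumably because a vector integer multiply reduction is
+; not profitable on these subtargets.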
+
+define i32 @test_mul(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; CHECK-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; CHECK-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; CHECK-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]]
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; CHECK-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]]
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4
+; CHECK-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4
+; CHECK-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]]
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4
+; CHECK-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]]
+; CHECK-NEXT: ret i32 [[MUL_714]]
+;
+entry:
+ %0 = load i32, i32* %p, align 4
+ %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+ %1 = load i32, i32* %arrayidx.1, align 4
+ %mul.18 = mul i32 %1, %0
+ %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+ %2 = load i32, i32* %arrayidx.2, align 4
+ %mul.29 = mul i32 %2, %mul.18
+ %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+ %3 = load i32, i32* %arrayidx.3, align 4
+ %mul.310 = mul i32 %3, %mul.29
+ %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+ %4 = load i32, i32* %arrayidx.4, align 4
+ %mul.411 = mul i32 %4, %mul.310
+ %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+ %5 = load i32, i32* %arrayidx.5, align 4
+ %mul.512 = mul i32 %5, %mul.411
+ %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+ %6 = load i32, i32* %arrayidx.6, align 4
+ %mul.613 = mul i32 %6, %mul.512
+ %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+ %7 = load i32, i32* %arrayidx.7, align 4
+ %mul.714 = mul i32 %7, %mul.613
+ ret i32 %mul.714
+}
+
+; int test_and(unsigned int *p) {
+; int result = 0;
+; for (int i = 0; i < 8; i++)
+; result &= p[i];
+; return result;
+; }
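+;
+; This and the test_or/test_xor cases below lower exactly like test_add, with
+; the reduction opcode swapped in the shuffle ladder.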
+
+define i32 @test_and(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[MUL_18:%.*]] = and i32 undef, undef
+; CHECK-NEXT: [[MUL_29:%.*]] = and i32 undef, [[MUL_18]]
+; CHECK-NEXT: [[MUL_310:%.*]] = and i32 undef, [[MUL_29]]
+; CHECK-NEXT: [[MUL_411:%.*]] = and i32 undef, [[MUL_310]]
+; CHECK-NEXT: [[MUL_512:%.*]] = and i32 undef, [[MUL_411]]
+; CHECK-NEXT: [[MUL_613:%.*]] = and i32 undef, [[MUL_512]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[MUL_714:%.*]] = and i32 undef, [[MUL_613]]
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+entry:
+ %0 = load i32, i32* %p, align 4
+ %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+ %1 = load i32, i32* %arrayidx.1, align 4
+ %mul.18 = and i32 %1, %0
+ %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+ %2 = load i32, i32* %arrayidx.2, align 4
+ %mul.29 = and i32 %2, %mul.18
+ %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+ %3 = load i32, i32* %arrayidx.3, align 4
+ %mul.310 = and i32 %3, %mul.29
+ %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+ %4 = load i32, i32* %arrayidx.4, align 4
+ %mul.411 = and i32 %4, %mul.310
+ %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+ %5 = load i32, i32* %arrayidx.5, align 4
+ %mul.512 = and i32 %5, %mul.411
+ %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+ %6 = load i32, i32* %arrayidx.6, align 4
+ %mul.613 = and i32 %6, %mul.512
+ %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+ %7 = load i32, i32* %arrayidx.7, align 4
+ %mul.714 = and i32 %7, %mul.613
+ ret i32 %mul.714
+}
+
+; int test_or(unsigned int *p) {
+; int result = 0;
+; for (int i = 0; i < 8; i++)
+; result |= p[i];
+; return result;
+; }
+
+define i32 @test_or(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_or(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[MUL_18:%.*]] = or i32 undef, undef
+; CHECK-NEXT: [[MUL_29:%.*]] = or i32 undef, [[MUL_18]]
+; CHECK-NEXT: [[MUL_310:%.*]] = or i32 undef, [[MUL_29]]
+; CHECK-NEXT: [[MUL_411:%.*]] = or i32 undef, [[MUL_310]]
+; CHECK-NEXT: [[MUL_512:%.*]] = or i32 undef, [[MUL_411]]
+; CHECK-NEXT: [[MUL_613:%.*]] = or i32 undef, [[MUL_512]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[MUL_714:%.*]] = or i32 undef, [[MUL_613]]
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+entry:
+ %0 = load i32, i32* %p, align 4
+ %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+ %1 = load i32, i32* %arrayidx.1, align 4
+ %mul.18 = or i32 %1, %0
+ %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+ %2 = load i32, i32* %arrayidx.2, align 4
+ %mul.29 = or i32 %2, %mul.18
+ %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+ %3 = load i32, i32* %arrayidx.3, align 4
+ %mul.310 = or i32 %3, %mul.29
+ %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+ %4 = load i32, i32* %arrayidx.4, align 4
+ %mul.411 = or i32 %4, %mul.310
+ %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+ %5 = load i32, i32* %arrayidx.5, align 4
+ %mul.512 = or i32 %5, %mul.411
+ %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+ %6 = load i32, i32* %arrayidx.6, align 4
+ %mul.613 = or i32 %6, %mul.512
+ %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+ %7 = load i32, i32* %arrayidx.7, align 4
+ %mul.714 = or i32 %7, %mul.613
+ ret i32 %mul.714
+}
+
+; int test_xor(unsigned int *p) {
+; int result = 0;
+; for (int i = 0; i < 8; i++)
+; result ^= p[i];
+; return result;
+; }
+
+define i32 @test_xor(i32* nocapture readonly %p) {
+; CHECK-LABEL: @test_xor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[MUL_18:%.*]] = xor i32 undef, undef
+; CHECK-NEXT: [[MUL_29:%.*]] = xor i32 undef, [[MUL_18]]
+; CHECK-NEXT: [[MUL_310:%.*]] = xor i32 undef, [[MUL_29]]
+; CHECK-NEXT: [[MUL_411:%.*]] = xor i32 undef, [[MUL_310]]
+; CHECK-NEXT: [[MUL_512:%.*]] = xor i32 undef, [[MUL_411]]
+; CHECK-NEXT: [[MUL_613:%.*]] = xor i32 undef, [[MUL_512]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = xor <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: [[MUL_714:%.*]] = xor i32 undef, [[MUL_613]]
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+entry:
+ %0 = load i32, i32* %p, align 4
+ %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
+ %1 = load i32, i32* %arrayidx.1, align 4
+ %mul.18 = xor i32 %1, %0
+ %arrayidx.2 = getelementptr inbounds i32, i32* %p, i64 2
+ %2 = load i32, i32* %arrayidx.2, align 4
+ %mul.29 = xor i32 %2, %mul.18
+ %arrayidx.3 = getelementptr inbounds i32, i32* %p, i64 3
+ %3 = load i32, i32* %arrayidx.3, align 4
+ %mul.310 = xor i32 %3, %mul.29
+ %arrayidx.4 = getelementptr inbounds i32, i32* %p, i64 4
+ %4 = load i32, i32* %arrayidx.4, align 4
+ %mul.411 = xor i32 %4, %mul.310
+ %arrayidx.5 = getelementptr inbounds i32, i32* %p, i64 5
+ %5 = load i32, i32* %arrayidx.5, align 4
+ %mul.512 = xor i32 %5, %mul.411
+ %arrayidx.6 = getelementptr inbounds i32, i32* %p, i64 6
+ %6 = load i32, i32* %arrayidx.6, align 4
+ %mul.613 = xor i32 %6, %mul.512
+ %arrayidx.7 = getelementptr inbounds i32, i32* %p, i64 7
+ %7 = load i32, i32* %arrayidx.7, align 4
+ %mul.714 = xor i32 %7, %mul.613
+ ret i32 %mul.714
+}
+
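+; Reproducer for PR37731: an xorshift-style state update on <4 x i32> that is
+; stored back, followed by a 4-lane xor reduction of the new state; the
+; scalar extract/xor chain becomes a log2(4) = 2 step shuffle reduction.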
+define i32 @PR37731(<4 x i32>* noalias nocapture dereferenceable(16) %self) unnamed_addr #0 {
+; CHECK-LABEL: @PR37731(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[SELF:%.*]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 6, i32 2, i32 13, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP2]], <i32 13, i32 27, i32 21, i32 12>
+; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP0]], <i32 -2, i32 -8, i32 -16, i32 -128>
+; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP4]], <i32 18, i32 2, i32 7, i32 13>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[SELF]], align 16
+; CHECK-NEXT: [[TMP7:%.*]] = xor i32 undef, undef
+; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP7]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <4 x i32> [[TMP6]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = xor <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], undef
+; CHECK-NEXT: ret i32 [[TMP9]]
+;
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %self, align 16
+ %1 = shl <4 x i32> %0, <i32 6, i32 2, i32 13, i32 3>
+ %2 = xor <4 x i32> %1, %0
+ %3 = lshr <4 x i32> %2, <i32 13, i32 27, i32 21, i32 12>
+ %4 = and <4 x i32> %0, <i32 -2, i32 -8, i32 -16, i32 -128>
+ %5 = shl <4 x i32> %4, <i32 18, i32 2, i32 7, i32 13>
+ %6 = xor <4 x i32> %3, %5
+ store <4 x i32> %6, <4 x i32>* %self, align 16
+ %7 = extractelement <4 x i32> %6, i32 0
+ %8 = extractelement <4 x i32> %6, i32 1
+ %9 = xor i32 %7, %8
+ %10 = extractelement <4 x i32> %6, i32 2
+ %11 = xor i32 %9, %10
+ %12 = extractelement <4 x i32> %6, i32 3
+ %13 = xor i32 %11, %12
+ ret i32 %13
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
new file mode 100644
index 00000000000..27997f6af3d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
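+; The first RUN line checks the transformed IR and writes the optimization
+; remarks to %t as YAML; the second RUN line verifies that YAML output.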
+define i32 @foo(i32* %diff) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[M2:%.*]] = alloca [8 x [8 x i32]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [8 x [8 x i32]]* [[M2]] to i8*
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP1]], 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP2]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], 1
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], 5
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP1]], 2
+; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP1]], 6
+; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 3
+; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP1]], 7
+; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
+; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 undef, [[A_088]]
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
+; CHECK-NEXT: [[ADD24:%.*]] = add nsw i32 [[ADD10]], undef
+; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2
+; CHECK-NEXT: [[ADD38:%.*]] = add nsw i32 [[ADD24]], undef
+; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP13]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP15]], [[A_088]]
+; CHECK-NEXT: [[ADD52:%.*]] = add nsw i32 [[ADD38]], undef
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret i32 [[OP_EXTRA]]
+;
+entry:
+ %m2 = alloca [8 x [8 x i32]], align 16
+ %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %a.088 = phi i32 [ 0, %entry ], [ %add52, %for.body ]
+ %1 = shl i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = or i64 %1, 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+ %4 = load i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %4, %2
+ %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+ store i32 %add3, i32* %arrayidx6, align 16
+
+ %add10 = add nsw i32 %add3, %a.088
+ %5 = or i64 %1, 1
+ %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+ %6 = load i32, i32* %arrayidx13, align 4
+ %7 = or i64 %1, 5
+ %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+ %8 = load i32, i32* %arrayidx16, align 4
+ %add17 = add nsw i32 %8, %6
+ %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+ store i32 %add17, i32* %arrayidx20, align 4
+
+ %add24 = add nsw i32 %add10, %add17
+ %9 = or i64 %1, 2
+ %arrayidx27 = getelementptr inbounds i32, i32* %diff, i64 %9
+ %10 = load i32, i32* %arrayidx27, align 4
+ %11 = or i64 %1, 6
+ %arrayidx30 = getelementptr inbounds i32, i32* %diff, i64 %11
+ %12 = load i32, i32* %arrayidx30, align 4
+ %add31 = add nsw i32 %12, %10
+ %arrayidx34 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 2
+ store i32 %add31, i32* %arrayidx34, align 8
+
+ %add38 = add nsw i32 %add24, %add31
+ %13 = or i64 %1, 3
+ %arrayidx41 = getelementptr inbounds i32, i32* %diff, i64 %13
+ %14 = load i32, i32* %arrayidx41, align 4
+ %15 = or i64 %1, 7
+ %arrayidx44 = getelementptr inbounds i32, i32* %diff, i64 %15
+ %16 = load i32, i32* %arrayidx44, align 4
+
+ %add45 = add nsw i32 %16, %14
+ %arrayidx48 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 3
+ store i32 %add45, i32* %arrayidx48, align 4
+
+ %add52 = add nsw i32 %add38, %add45
+
+ ; YAML: --- !Passed
+ ; YAML-NEXT: Pass: slp-vectorizer
+ ; YAML-NEXT: Name: StoresVectorized
+ ; YAML-NEXT: Function: foo
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT: - String: 'Stores SLP vectorized with cost '
+ ; YAML-NEXT: - Cost: '-8'
+ ; YAML-NEXT: - String: ' and with tree size '
+ ; YAML-NEXT: - TreeSize: '4'
+
+ ; YAML: --- !Passed
+ ; YAML-NEXT: Pass: slp-vectorizer
+ ; YAML-NEXT: Name: VectorizedHorizontalReduction
+ ; YAML-NEXT: Function: foo
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
+ ; YAML-NEXT: - Cost: '-2'
+ ; YAML-NEXT: - String: ' and with tree size '
+ ; YAML-NEXT: - TreeSize: '1'
+
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret i32 %add52
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_listcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_listcost.ll
new file mode 100644
index 00000000000..ed2cd870cf6
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_listcost.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+define void @vsub2_test(i32* %pin1, i32* %pin2, i32* %pout) #0 {
+; CHECK-LABEL: @vsub2_test(
+; CHECK-NEXT: br label [[TMP1:%.*]]
+; CHECK: [[IDX_04:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP8:%.*]], [[TMP1]] ]
+; CHECK-NEXT: [[PO_03:%.*]] = phi i32* [ [[POUT:%.*]], [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ]
+; CHECK-NEXT: [[PTMPI2_02:%.*]] = phi i32* [ [[PIN2:%.*]], [[TMP0]] ], [ [[TMP4:%.*]], [[TMP1]] ]
+; CHECK-NEXT: [[PTMPI1_01:%.*]] = phi i32* [ [[PIN1:%.*]], [[TMP0]] ], [ [[TMP2:%.*]], [[TMP1]] ]
+; CHECK-NEXT: [[TMP2]] = getelementptr inbounds i32, i32* [[PTMPI1_01]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[PTMPI1_01]], align 4, !tbaa !1
+; CHECK-NEXT: [[TMP4]] = getelementptr inbounds i32, i32* [[PTMPI2_02]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[PTMPI2_02]], align 4, !tbaa !1
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw i32 [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP7]] = getelementptr inbounds i32, i32* [[PO_03]], i64 1
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[PO_03]], align 4, !tbaa !1
+; CHECK-NEXT: [[TMP8]] = add nuw nsw i32 [[IDX_04]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP8]], 64
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[TMP9:%.*]], label [[TMP1]], !llvm.loop !5
+; CHECK: ret void
+;
+ br label %1
+
+ %idx.04 = phi i32 [ 0, %0 ], [ %8, %1 ]
+ %po.03 = phi i32* [ %pout, %0 ], [ %7, %1 ]
+ %ptmpi2.02 = phi i32* [ %pin2, %0 ], [ %4, %1 ]
+ %ptmpi1.01 = phi i32* [ %pin1, %0 ], [ %2, %1 ]
+ %2 = getelementptr inbounds i32, i32* %ptmpi1.01, i64 1
+ %3 = load i32, i32* %ptmpi1.01, align 4, !tbaa !1
+ %4 = getelementptr inbounds i32, i32* %ptmpi2.02, i64 1
+ %5 = load i32, i32* %ptmpi2.02, align 4, !tbaa !1
+ %6 = sub nsw i32 %3, %5
+ %7 = getelementptr inbounds i32, i32* %po.03, i64 1
+ ; YAML: Pass: slp-vectorizer
+ ; YAML-NEXT: Name: NotBeneficial
+ ; YAML-NEXT: Function: vsub2_test
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT: - String: 'List vectorization was possible but not beneficial with cost '
+ ; YAML-NEXT: - Cost: '0'
+ ; YAML-NEXT: - String: ' >= '
+ ; YAML-NEXT: - Treshold: '0'
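+ ; (The 'Treshold' spelling above matches the remark's own argument name,
+ ; not a typo introduced by this test.)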
+ store i32 %6, i32* %po.03, align 4, !tbaa !1
+ %8 = add nuw nsw i32 %idx.04, 1
+ %exitcond = icmp eq i32 %8, 64
+ br i1 %exitcond, label %9, label %1, !llvm.loop !5
+
+ ret void
+}
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = distinct !{!5, !6, !7}
+!6 = !{!"llvm.loop.vectorize.width", i32 1}
+!7 = !{!"llvm.loop.interleave.count", i32 1}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
new file mode 100644
index 00000000000..a0d3c939d4f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
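+; Only the first two columns of each row are computed and stored here, and a
+; bundle of two i32 stores is too small for the vectorization factors the
+; pass has available, so nothing is vectorized and the NotPossible remark
+; checked inside the loop body fires.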
+define i32 @foo(i32* nocapture readonly %diff) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[M2:%.*]] = alloca [8 x [8 x i32]], align 16
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [8 x [8 x i32]]* [[M2]] to i8*
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD24:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP4]], [[TMP2]]
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0
+; CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX6]], align 16
+; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD3]], [[A_088]]
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP1]], 1
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 5
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4
+; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP8]], [[TMP6]]
+; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1
+; CHECK-NEXT: store i32 [[ADD17]], i32* [[ARRAYIDX20]], align 4
+; CHECK-NEXT: [[ADD24]] = add nsw i32 [[ADD10]], [[ADD17]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 0
+; CHECK-NEXT: ret i32 [[ADD24]]
+;
+entry:
+ %m2 = alloca [8 x [8 x i32]], align 16
+ %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %a.088 = phi i32 [ 0, %entry ], [ %add24, %for.body ]
+ %1 = shl i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = or i64 %1, 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+ %4 = load i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %4, %2
+ %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+ store i32 %add3, i32* %arrayidx6, align 16
+ %add10 = add nsw i32 %add3, %a.088
+ %5 = or i64 %1, 1
+ %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+ %6 = load i32, i32* %arrayidx13, align 4
+ %7 = or i64 %1, 5
+ %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+ %8 = load i32, i32* %arrayidx16, align 4
+ %add17 = add nsw i32 %8, %6
+ %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+ store i32 %add17, i32* %arrayidx20, align 4
+ %add24 = add nsw i32 %add10, %add17
+
+ ; YAML: Pass: slp-vectorizer
+ ; YAML-NEXT: Name: NotPossible
+ ; YAML-NEXT: Function: foo
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT: - String: 'Cannot SLP vectorize list: vectorization was impossible'
+ ; YAML-NEXT: - String: ' with available vectorization factors'
+
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %arraydecay = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 0
+ ret i32 %add24
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll
new file mode 100644
index 00000000000..a134aec00bb
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_unsupported.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=x86_64-pc-linux-gnu -mcpu=generic -slp-vectorizer -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: FileCheck --input-file=%t --check-prefix=YAML %s
+
+; x86_fp80 is unsupported by the SLP vectorizer, so this function stays scalar.
+define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I1_0:%.*]] = load x86_fp80, x86_fp80* [[I1:%.*]], align 16
+; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr x86_fp80, x86_fp80* [[I1]], i64 1
+; CHECK-NEXT: [[I1_1:%.*]] = load x86_fp80, x86_fp80* [[I1_GEP1]], align 16
+; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[I2_GEP0:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[I2:%.*]], i64 0
+; CHECK-NEXT: [[I2_0:%.*]] = load x86_fp80, x86_fp80* [[I2_GEP0]], align 16
+; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[I2]], i64 1
+; CHECK-NEXT: [[I2_1:%.*]] = load x86_fp80, x86_fp80* [[I2_GEP1]], align 16
+; CHECK-NEXT: br label [[END]]
+; CHECK: end:
+; CHECK-NEXT: [[PHI0:%.*]] = phi x86_fp80 [ [[I1_0]], [[ENTRY:%.*]] ], [ [[I2_0]], [[THEN]] ]
+; CHECK-NEXT: [[PHI1:%.*]] = phi x86_fp80 [ [[I1_1]], [[ENTRY]] ], [ [[I2_1]], [[THEN]] ]
+; CHECK-NEXT: store x86_fp80 [[PHI0]], x86_fp80* [[O:%.*]], align 16
+; CHECK-NEXT: [[O_GEP1:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[O]], i64 1
+; CHECK-NEXT: store x86_fp80 [[PHI1]], x86_fp80* [[O_GEP1]], align 16
+; CHECK-NEXT: ret void
+;
+entry:
+ %i1.0 = load x86_fp80, x86_fp80* %i1, align 16
+ %i1.gep1 = getelementptr x86_fp80, x86_fp80* %i1, i64 1
+ %i1.1 = load x86_fp80, x86_fp80* %i1.gep1, align 16
+ br i1 undef, label %then, label %end
+then:
+ %i2.gep0 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 0
+ %i2.0 = load x86_fp80, x86_fp80* %i2.gep0, align 16
+ %i2.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %i2, i64 1
+ %i2.1 = load x86_fp80, x86_fp80* %i2.gep1, align 16
+ br label %end
+end:
+ %phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
+
+ %phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
+ store x86_fp80 %phi0, x86_fp80* %o, align 16
+ %o.gep1 = getelementptr inbounds x86_fp80, x86_fp80* %o, i64 1
+ store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
+ ; YAML: Pass: slp-vectorizer
+ ; YAML-NEXT: Name: UnsupportedType
+ ; YAML-NEXT: Function: test
+ ; YAML-NEXT: Args:
+ ; YAML-NEXT: - String: 'Cannot SLP vectorize list: type '
+ ; YAML-NEXT: - String: x86_fp80 is unsupported by vectorizer
+
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
new file mode 100644
index 00000000000..afee26f0d40
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown -mcpu=corei7-avx | FileCheck %s
+
+%struct.complex = type { float, float }
+
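+; Accumulates the complex products A[i] * B[i]:
+;   re += a.re*b.re - a.im*b.im;  im += a.im*b.re + a.re*b.im;
+; at this revision the loop body is left scalar, as the CHECK lines show.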
+define void @foo(%struct.complex* %A, %struct.complex* %B, %struct.complex* %Result) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP18]] = fadd float [[TMP3]], [[TMP14]]
+; CHECK-NEXT: [[TMP19]] = fadd float [[TMP2]], [[TMP17]]
+; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK: exit:
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0
+; CHECK-NEXT: store float [[TMP18]], float* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1
+; CHECK-NEXT: store float [[TMP19]], float* [[TMP23]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = add i64 256, 0
+ br label %loop
+
+loop:
+ %1 = phi i64 [ 0, %entry ], [ %20, %loop ]
+ %2 = phi float [ 0.000000e+00, %entry ], [ %19, %loop ]
+ %3 = phi float [ 0.000000e+00, %entry ], [ %18, %loop ]
+ %4 = getelementptr inbounds %struct.complex, %struct.complex* %A, i64 %1, i32 0
+ %5 = load float, float* %4, align 4
+ %6 = getelementptr inbounds %struct.complex, %struct.complex* %A, i64 %1, i32 1
+ %7 = load float, float* %6, align 4
+ %8 = getelementptr inbounds %struct.complex, %struct.complex* %B, i64 %1, i32 0
+ %9 = load float, float* %8, align 4
+ %10 = getelementptr inbounds %struct.complex, %struct.complex* %B, i64 %1, i32 1
+ %11 = load float, float* %10, align 4
+ %12 = fmul float %5, %9
+ %13 = fmul float %7, %11
+ %14 = fsub float %12, %13
+ %15 = fmul float %7, %9
+ %16 = fmul float %5, %11
+ %17 = fadd float %15, %16
+ %18 = fadd float %3, %14
+ %19 = fadd float %2, %17
+ %20 = add nuw nsw i64 %1, 1
+ %21 = icmp eq i64 %20, %0
+ br i1 %21, label %exit, label %loop
+
+exit:
+ %22 = getelementptr inbounds %struct.complex, %struct.complex* %Result, i32 0, i32 0
+ store float %18, float* %22, align 4
+ %23 = getelementptr inbounds %struct.complex, %struct.complex* %Result, i32 0, i32 1
+ store float %19, float* %23, align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
new file mode 100644
index 00000000000..13884efd98d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
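+; The repeated icmp sgt/select (smax) and icmp slt/select (smin) chains are
+; turned into shuffle-based min/max reductions (the RDX_MINMAX_* values in
+; the CHECK lines).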
+define void @hoge() {
+; CHECK-LABEL: @hoge(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: ret void
+; CHECK: bb2:
+; CHECK-NEXT: [[TMP:%.*]] = select i1 undef, i16 undef, i16 15
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[TMP]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> <i32 63, i32 undef>, [[REORDER_SHUFFLE]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef
+; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], <i32 undef, i32 15, i32 31, i32 47>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 undef, i32 undef
+; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], undef
+; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef
+; CHECK-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], undef
+; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP5]], [[RDX_SHUF9]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP5]], <4 x i32> [[RDX_SHUF9]]
+; CHECK-NEXT: [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP13:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT11]], [[RDX_SHUF12]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT14:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP13]], <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> [[RDX_SHUF12]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT14]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef
+; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], 63
+; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -49, i32 -33, i32 -33, i32 -17>
+; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 undef, i32 undef
+; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP27]], undef
+; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP28]], i32 undef, i32 [[TMP27]]
+; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP31]], i32 undef, i32 undef
+; CHECK-NEXT: [[TMP33:%.*]] = icmp sgt i32 [[TMP32]], [[TMP29]]
+; CHECK-NEXT: [[TMP34:%.*]] = select i1 [[TMP33]], i32 [[TMP29]], i32 [[TMP32]]
+; CHECK-NEXT: [[TMP36:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT: [[TMP37:%.*]] = select i1 [[TMP36]], i32 undef, i32 undef
+; CHECK-NEXT: [[TMP38:%.*]] = icmp sgt i32 [[TMP37]], [[TMP34]]
+; CHECK-NEXT: [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[TMP34]], i32 [[TMP37]]
+; CHECK-NEXT: [[TMP41:%.*]] = icmp sgt i32 undef, undef
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 undef, i32 undef
+; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP42]], [[TMP39]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP9]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP9]], <4 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef
+; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef
+; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA]], i32 undef
+; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef
+; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA4]], i32 undef
+; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef
+; CHECK-NEXT: [[OP_EXTRA6:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA5]], i32 undef
+; CHECK-NEXT: [[TMP15:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef
+; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[TMP15]], i32 [[OP_EXTRA6]], i32 undef
+; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP39]], i32 [[TMP42]]
+; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA7]]
+; CHECK-NEXT: unreachable
+;
+bb:
+ br i1 undef, label %bb1, label %bb2
+
+bb1: ; preds = %bb
+ ret void
+
+bb2: ; preds = %bb
+ %tmp = select i1 undef, i16 undef, i16 15
+ %tmp3 = sext i16 undef to i32
+ %tmp4 = sext i16 %tmp to i32
+ %tmp5 = sub nsw i32 undef, %tmp4
+ %tmp6 = sub i32 %tmp5, undef
+ %tmp7 = sub nsw i32 63, %tmp3
+ %tmp8 = sub i32 %tmp7, undef
+ %tmp9 = add i32 %tmp8, undef
+ %tmp10 = add nsw i32 %tmp6, 15
+ %tmp11 = icmp sgt i32 %tmp9, %tmp10
+ %tmp12 = select i1 %tmp11, i32 %tmp9, i32 %tmp10
+ %tmp13 = add nsw i32 %tmp6, 31
+ %tmp14 = icmp sgt i32 %tmp12, %tmp13
+ %tmp15 = select i1 %tmp14, i32 %tmp12, i32 %tmp13
+ %tmp16 = add nsw i32 %tmp6, 47
+ %tmp17 = icmp sgt i32 %tmp15, %tmp16
+ %tmp18 = select i1 %tmp17, i32 %tmp15, i32 %tmp16
+ %tmp19 = select i1 undef, i32 %tmp18, i32 undef
+ %tmp20 = icmp sgt i32 %tmp19, 63
+ %tmp21 = sub nsw i32 undef, %tmp3
+ %tmp22 = sub i32 %tmp21, undef
+ %tmp23 = sub nsw i32 undef, %tmp4
+ %tmp24 = sub i32 %tmp23, undef
+ %tmp25 = add nsw i32 %tmp24, -49
+ %tmp26 = icmp sgt i32 %tmp25, undef
+ %tmp27 = select i1 %tmp26, i32 undef, i32 %tmp25
+ %tmp28 = icmp sgt i32 %tmp27, undef
+ %tmp29 = select i1 %tmp28, i32 undef, i32 %tmp27
+ %tmp30 = add nsw i32 %tmp22, -33
+ %tmp31 = icmp sgt i32 %tmp30, undef
+ %tmp32 = select i1 %tmp31, i32 undef, i32 %tmp30
+ %tmp33 = icmp sgt i32 %tmp32, %tmp29
+ %tmp34 = select i1 %tmp33, i32 %tmp29, i32 %tmp32
+ %tmp35 = add nsw i32 %tmp24, -33
+ %tmp36 = icmp sgt i32 %tmp35, undef
+ %tmp37 = select i1 %tmp36, i32 undef, i32 %tmp35
+ %tmp38 = icmp sgt i32 %tmp37, %tmp34
+ %tmp39 = select i1 %tmp38, i32 %tmp34, i32 %tmp37
+ %tmp40 = add nsw i32 %tmp22, -17
+ %tmp41 = icmp sgt i32 %tmp40, undef
+ %tmp42 = select i1 %tmp41, i32 undef, i32 %tmp40
+ %tmp43 = icmp sgt i32 %tmp42, %tmp39
+ %tmp44 = select i1 %tmp43, i32 %tmp39, i32 %tmp42
+ %tmp45 = icmp sgt i32 undef, %tmp44
+ unreachable
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
new file mode 100644
index 00000000000..28bc95e2f4c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+%"struct.std::array" = type { [32 x i8] }
+
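+; The scalar code below extracts bits 0..15 of a 32-bit value (lshr, trunc to
+; i8, and with 1) and stores each bit into consecutive bytes of the array; a C
+; sketch of the pattern (illustrative only):
+;
+;   for (int i = 0; i < 16; ++i)
+;     arr[i] = (unsigned char)((x >> i) & 1);
+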
+; Function Attrs: nounwind uwtable
+define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv() unnamed_addr #0 align 2 {
+; CHECK-LABEL: @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]]
+; CHECK: if.then22.i:
+; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
+; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
+; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
+; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
+; CHECK-NEXT: [[ARRAYIDX_I_I7_3_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
+; CHECK-NEXT: [[ARRAYIDX_I_I7_4_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
+; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
+; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
+; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7
+; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
+; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
+; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
+; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3
+; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], <i32 9, i32 10, i32 11, i32 12>
+; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
+; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
+; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
+; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14
+; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
+; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1
+; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2
+; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3
+; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12
+; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13
+; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
+; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
+; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
+; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
+; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
+; CHECK-NEXT: unreachable
+; CHECK: if.end50.i:
+; CHECK-NEXT: ret void
+;
+entry:
+ br i1 undef, label %if.end50.i, label %if.then22.i
+
+if.then22.i: ; preds = %entry
+ %sub.i = add nsw i32 undef, -1
+ %conv31.i = and i32 undef, %sub.i
+ %0 = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
+ %1 = trunc i32 %sub.i to i8
+ %conv.i.i1199 = and i8 %1, 1
+ store i8 %conv.i.i1199, i8* %0, align 1
+ %shr.i.i = lshr i32 %conv31.i, 1
+ %2 = trunc i32 %shr.i.i to i8
+ %conv.1.i.i = and i8 %2, 1
+ %arrayidx.i.i7.1.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1
+ store i8 %conv.1.i.i, i8* %arrayidx.i.i7.1.i.i, align 1
+ %shr.1.i.i = lshr i32 %conv31.i, 2
+ %3 = trunc i32 %shr.1.i.i to i8
+ %conv.2.i.i = and i8 %3, 1
+ %arrayidx.i.i7.2.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2
+ store i8 %conv.2.i.i, i8* %arrayidx.i.i7.2.i.i, align 1
+ %shr.2.i.i = lshr i32 %conv31.i, 3
+ %4 = trunc i32 %shr.2.i.i to i8
+ %conv.3.i.i = and i8 %4, 1
+ %arrayidx.i.i7.3.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 3
+ store i8 %conv.3.i.i, i8* %arrayidx.i.i7.3.i.i, align 1
+ %shr.3.i.i = lshr i32 %conv31.i, 4
+ %5 = trunc i32 %shr.3.i.i to i8
+ %conv.4.i.i = and i8 %5, 1
+ %arrayidx.i.i7.4.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 4
+ store i8 %conv.4.i.i, i8* %arrayidx.i.i7.4.i.i, align 1
+ %shr.4.i.i = lshr i32 %conv31.i, 5
+ %6 = trunc i32 %shr.4.i.i to i8
+ %conv.5.i.i = and i8 %6, 1
+ %arrayidx.i.i7.5.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5
+ store i8 %conv.5.i.i, i8* %arrayidx.i.i7.5.i.i, align 1
+ %shr.5.i.i = lshr i32 %conv31.i, 6
+ %7 = trunc i32 %shr.5.i.i to i8
+ %conv.6.i.i = and i8 %7, 1
+ %arrayidx.i.i7.6.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6
+ store i8 %conv.6.i.i, i8* %arrayidx.i.i7.6.i.i, align 1
+ %shr.6.i.i = lshr i32 %conv31.i, 7
+ %8 = trunc i32 %shr.6.i.i to i8
+ %conv.7.i.i = and i8 %8, 1
+ %arrayidx.i.i7.7.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7
+ store i8 %conv.7.i.i, i8* %arrayidx.i.i7.7.i.i, align 1
+ %shr.7.i.i = lshr i32 %conv31.i, 8
+ %9 = trunc i32 %shr.7.i.i to i8
+ %conv.8.i.i = and i8 %9, 1
+ %arrayidx.i.i7.8.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8
+ store i8 %conv.8.i.i, i8* %arrayidx.i.i7.8.i.i, align 1
+ %shr.8.i.i = lshr i32 %conv31.i, 9
+ %10 = trunc i32 %shr.8.i.i to i8
+ %conv.9.i.i = and i8 %10, 1
+ %arrayidx.i.i7.9.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9
+ store i8 %conv.9.i.i, i8* %arrayidx.i.i7.9.i.i, align 1
+ %shr.9.i.i = lshr i32 %conv31.i, 10
+ %11 = trunc i32 %shr.9.i.i to i8
+ %conv.10.i.i = and i8 %11, 1
+ %arrayidx.i.i7.10.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10
+ store i8 %conv.10.i.i, i8* %arrayidx.i.i7.10.i.i, align 1
+ %shr.10.i.i = lshr i32 %conv31.i, 11
+ %12 = trunc i32 %shr.10.i.i to i8
+ %conv.11.i.i = and i8 %12, 1
+ %arrayidx.i.i7.11.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11
+ store i8 %conv.11.i.i, i8* %arrayidx.i.i7.11.i.i, align 1
+ %shr.11.i.i = lshr i32 %conv31.i, 12
+ %13 = trunc i32 %shr.11.i.i to i8
+ %conv.12.i.i = and i8 %13, 1
+ %arrayidx.i.i7.12.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12
+ store i8 %conv.12.i.i, i8* %arrayidx.i.i7.12.i.i, align 1
+ %shr.12.i.i = lshr i32 %conv31.i, 13
+ %14 = trunc i32 %shr.12.i.i to i8
+ %conv.13.i.i = and i8 %14, 1
+ %arrayidx.i.i7.13.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13
+ store i8 %conv.13.i.i, i8* %arrayidx.i.i7.13.i.i, align 1
+ %shr.13.i.i = lshr i32 %conv31.i, 14
+ %15 = trunc i32 %shr.13.i.i to i8
+ %conv.14.i.i = and i8 %15, 1
+ %arrayidx.i.i7.14.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14
+ store i8 %conv.14.i.i, i8* %arrayidx.i.i7.14.i.i, align 1
+ %shr.14.i.i = lshr i32 %conv31.i, 15
+ %16 = trunc i32 %shr.14.i.i to i8
+ %conv.15.i.i = and i8 %16, 1
+ %arrayidx.i.i7.15.i.i = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
+ store i8 %conv.15.i.i, i8* %arrayidx.i.i7.15.i.i, align 1
+ unreachable
+
+if.end50.i: ; preds = %entry
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/return.ll b/llvm/test/Transforms/SLPVectorizer/X86/return.ll
new file mode 100644
index 00000000000..445dcba20d5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/return.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "x86_64--linux-gnu"
+
+@a = common global [4 x double] zeroinitializer, align 8
+@b = common global [4 x double] zeroinitializer, align 8
+
+; double a[4], b[4];
+; double foo() {
+; double sum = 0;
+; sum = (a[0]+b[0]) + (a[1]+b[1]);
+; return sum;
+; }
+
+define double @return1() {
+; CHECK-LABEL: @return1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([4 x double]* @a to <2 x double>*), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([4 x double]* @b to <2 x double>*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[TMP3]], [[TMP4]]
+; CHECK-NEXT: ret double [[ADD2]]
+;
+entry:
+ %a0 = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @a, i32 0, i32 0), align 8
+ %b0 = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @b, i32 0, i32 0), align 8
+ %add0 = fadd double %a0, %b0
+ %a1 = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @a, i32 0, i32 1), align 8
+ %b1 = load double, double* getelementptr inbounds ([4 x double], [4 x double]* @b, i32 0, i32 1), align 8
+ %add1 = fadd double %a1, %b1
+ %add2 = fadd double %add0, %add1
+ ret double %add2
+}
+
+; double hadd(double *x) {
+; return ((x[0] + x[2]) + (x[1] + x[3]));
+; }
+
+define double @return2(double* nocapture readonly %x) {
+; CHECK-LABEL: @return2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i32 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[X]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX1]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[TMP5]], [[TMP6]]
+; CHECK-NEXT: ret double [[ADD5]]
+;
+entry:
+ %x0 = load double, double* %x, align 4
+ %arrayidx1 = getelementptr inbounds double, double* %x, i32 2
+ %x2 = load double, double* %arrayidx1, align 4
+ %add3 = fadd double %x0, %x2
+ %arrayidx2 = getelementptr inbounds double, double* %x, i32 1
+ %x1 = load double, double* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds double, double* %x, i32 3
+ %x3 = load double, double* %arrayidx3, align 4
+ %add4 = fadd double %x1, %x3
+ %add5 = fadd double %add3, %add4
+ ret double %add5
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
new file mode 100644
index 00000000000..7e7ea7fde25
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
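+; Each function below computes a scalar 4-element dot product from extracted
+; lanes; a C sketch using GCC vector extensions (illustrative only):
+;
+;   typedef float v4f __attribute__((vector_size(16)));
+;   float dotf(v4f x, v4f y) {
+;     return x[0]*y[0] + x[1]*y[1] + x[2]*y[2] + x[3]*y[3];
+;   }
+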
+define float @dotf(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @dotf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP1]]
+;
+entry:
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecext1 = extractelement <4 x float> %y, i32 0
+ %mul = fmul fast float %vecext, %vecext1
+ %vecext.1 = extractelement <4 x float> %x, i32 1
+ %vecext1.1 = extractelement <4 x float> %y, i32 1
+ %mul.1 = fmul fast float %vecext.1, %vecext1.1
+ %add.1 = fadd fast float %mul.1, %mul
+ %vecext.2 = extractelement <4 x float> %x, i32 2
+ %vecext1.2 = extractelement <4 x float> %y, i32 2
+ %mul.2 = fmul fast float %vecext.2, %vecext1.2
+ %add.2 = fadd fast float %mul.2, %add.1
+ %vecext.3 = extractelement <4 x float> %x, i32 3
+ %vecext1.3 = extractelement <4 x float> %y, i32 3
+ %mul.3 = fmul fast float %vecext.3, %vecext1.3
+ %add.3 = fadd fast float %mul.3, %add.2
+ ret float %add.3
+}
+
+define double @dotd(<4 x double>* byval nocapture readonly align 32, <4 x double>* byval nocapture readonly align 32) {
+; CHECK-LABEL: @dotd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32
+; CHECK-NEXT: [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret double [[TMP3]]
+;
+entry:
+ %x = load <4 x double>, <4 x double>* %0, align 32
+ %y = load <4 x double>, <4 x double>* %1, align 32
+ %vecext = extractelement <4 x double> %x, i32 0
+ %vecext1 = extractelement <4 x double> %y, i32 0
+ %mul = fmul fast double %vecext, %vecext1
+ %vecext.1 = extractelement <4 x double> %x, i32 1
+ %vecext1.1 = extractelement <4 x double> %y, i32 1
+ %mul.1 = fmul fast double %vecext.1, %vecext1.1
+ %add.1 = fadd fast double %mul.1, %mul
+ %vecext.2 = extractelement <4 x double> %x, i32 2
+ %vecext1.2 = extractelement <4 x double> %y, i32 2
+ %mul.2 = fmul fast double %vecext.2, %vecext1.2
+ %add.2 = fadd fast double %mul.2, %add.1
+ %vecext.3 = extractelement <4 x double> %x, i32 3
+ %vecext1.3 = extractelement <4 x double> %y, i32 3
+ %mul.3 = fmul fast double %vecext.3, %vecext1.3
+ %add.3 = fadd fast double %mul.3, %add.2
+ ret double %add.3
+}
+
+define float @dotfq(<4 x float>* nocapture readonly %x, <4 x float>* nocapture readonly %y) {
+; CHECK-LABEL: @dotfq(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP3]]
+;
+entry:
+ %0 = load <4 x float>, <4 x float>* %x, align 16
+ %1 = load <4 x float>, <4 x float>* %y, align 16
+ %vecext = extractelement <4 x float> %0, i32 0
+ %vecext1 = extractelement <4 x float> %1, i32 0
+ %mul = fmul fast float %vecext1, %vecext
+ %vecext.1 = extractelement <4 x float> %0, i32 1
+ %vecext1.1 = extractelement <4 x float> %1, i32 1
+ %mul.1 = fmul fast float %vecext1.1, %vecext.1
+ %add.1 = fadd fast float %mul.1, %mul
+ %vecext.2 = extractelement <4 x float> %0, i32 2
+ %vecext1.2 = extractelement <4 x float> %1, i32 2
+ %mul.2 = fmul fast float %vecext1.2, %vecext.2
+ %add.2 = fadd fast float %mul.2, %add.1
+ %vecext.3 = extractelement <4 x float> %0, i32 3
+ %vecext1.3 = extractelement <4 x float> %1, i32 3
+ %mul.3 = fmul fast float %vecext1.3, %vecext.3
+ %add.3 = fadd fast float %mul.3, %add.2
+ ret float %add.3
+}
+
+define double @dotdq(<4 x double>* nocapture readonly %x, <4 x double>* nocapture readonly %y) {
+; CHECK-LABEL: @dotdq(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret double [[TMP3]]
+;
+entry:
+ %0 = load <4 x double>, <4 x double>* %x, align 32
+ %1 = load <4 x double>, <4 x double>* %y, align 32
+ %vecext = extractelement <4 x double> %0, i32 0
+ %vecext1 = extractelement <4 x double> %1, i32 0
+ %mul = fmul fast double %vecext1, %vecext
+ %vecext.1 = extractelement <4 x double> %0, i32 1
+ %vecext1.1 = extractelement <4 x double> %1, i32 1
+ %mul.1 = fmul fast double %vecext1.1, %vecext.1
+ %add.1 = fadd fast double %mul.1, %mul
+ %vecext.2 = extractelement <4 x double> %0, i32 2
+ %vecext1.2 = extractelement <4 x double> %1, i32 2
+ %mul.2 = fmul fast double %vecext1.2, %vecext.2
+ %add.2 = fadd fast double %mul.2, %add.1
+ %vecext.3 = extractelement <4 x double> %0, i32 3
+ %vecext1.3 = extractelement <4 x double> %1, i32 3
+ %mul.3 = fmul fast double %vecext1.3, %vecext.3
+ %add.3 = fadd fast double %mul.3, %add.2
+ ret double %add.3
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
new file mode 100644
index 00000000000..c7e419b5f56
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+; We disable the vectorization of <3 x float> for now.
+
+; float foo(float *A) {
+;
+; float R = A[0];
+; float G = A[1];
+; float B = A[2];
+; for (int i=0; i < 121; i+=3) {
+; R+=A[i+0]*7;
+; G+=A[i+1]*8;
+; B+=A[i+2]*9;
+; }
+;
+; return R+G+B;
+; }
+
+define float @foo(float* nocapture readonly %A) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ]
+; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00
+; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]]
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00
+; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4
+; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00
+; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]]
+; CHECK: for.body.for.body_crit_edge:
+; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4
+; CHECK-NEXT: br label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]]
+; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]]
+; CHECK-NEXT: ret float [[ADD17]]
+;
+entry:
+ %0 = load float, float* %A, align 4
+ %arrayidx1 = getelementptr inbounds float, float* %A, i64 1
+ %1 = load float, float* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %A, i64 2
+ %2 = load float, float* %arrayidx2, align 4
+ br label %for.body
+
+for.body: ; preds = %for.body.for.body_crit_edge, %entry
+ %3 = phi float [ %0, %entry ], [ %.pre, %for.body.for.body_crit_edge ]
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body.for.body_crit_edge ]
+ %B.032 = phi float [ %2, %entry ], [ %add14, %for.body.for.body_crit_edge ]
+ %G.031 = phi float [ %1, %entry ], [ %add9, %for.body.for.body_crit_edge ]
+ %R.030 = phi float [ %0, %entry ], [ %add4, %for.body.for.body_crit_edge ]
+ %mul = fmul float %3, 7.000000e+00
+ %add4 = fadd float %R.030, %mul
+ %4 = add nsw i64 %indvars.iv, 1
+ %arrayidx7 = getelementptr inbounds float, float* %A, i64 %4
+ %5 = load float, float* %arrayidx7, align 4
+ %mul8 = fmul float %5, 8.000000e+00
+ %add9 = fadd float %G.031, %mul8
+ %6 = add nsw i64 %indvars.iv, 2
+ %arrayidx12 = getelementptr inbounds float, float* %A, i64 %6
+ %7 = load float, float* %arrayidx12, align 4
+ %mul13 = fmul float %7, 9.000000e+00
+ %add14 = fadd float %B.032, %mul13
+ %indvars.iv.next = add i64 %indvars.iv, 3
+ %8 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %8, 121
+ br i1 %cmp, label %for.body.for.body_crit_edge, label %for.end
+
+for.body.for.body_crit_edge: ; preds = %for.body
+ %arrayidx3.phi.trans.insert = getelementptr inbounds float, float* %A, i64 %indvars.iv.next
+ %.pre = load float, float* %arrayidx3.phi.trans.insert, align 4
+ br label %for.body
+
+for.end: ; preds = %for.body
+ %add16 = fadd float %add4, %add9
+ %add17 = fadd float %add16, %add14
+ ret float %add17
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll b/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll
new file mode 100644
index 00000000000..7e9109a4ef0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; SLP vectorization example from http://cs.stanford.edu/people/eschkufz/research/asplos291-schkufza.pdf
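+; A C sketch of the scalar pattern below (four manually unrolled SAXPY
+; iterations; illustrative only):
+;
+;   for (long k = i; k < i + 4; ++k)
+;     x[k] = a * x[k] + y[k];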
+define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a, i64 %i) {
+; CHECK-LABEL: @SAXPY(
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[I:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 [[I]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT: ret void
+;
+ %1 = getelementptr inbounds i32, i32* %x, i64 %i
+ %2 = load i32, i32* %1, align 4
+ %3 = mul nsw i32 %2, %a
+ %4 = getelementptr inbounds i32, i32* %y, i64 %i
+ %5 = load i32, i32* %4, align 4
+ %6 = add nsw i32 %3, %5
+ store i32 %6, i32* %1, align 4
+ %7 = add i64 %i, 1
+ %8 = getelementptr inbounds i32, i32* %x, i64 %7
+ %9 = load i32, i32* %8, align 4
+ %10 = mul nsw i32 %9, %a
+ %11 = getelementptr inbounds i32, i32* %y, i64 %7
+ %12 = load i32, i32* %11, align 4
+ %13 = add nsw i32 %10, %12
+ store i32 %13, i32* %8, align 4
+ %14 = add i64 %i, 2
+ %15 = getelementptr inbounds i32, i32* %x, i64 %14
+ %16 = load i32, i32* %15, align 4
+ %17 = mul nsw i32 %16, %a
+ %18 = getelementptr inbounds i32, i32* %y, i64 %14
+ %19 = load i32, i32* %18, align 4
+ %20 = add nsw i32 %17, %19
+ store i32 %20, i32* %15, align 4
+ %21 = add i64 %i, 3
+ %22 = getelementptr inbounds i32, i32* %x, i64 %21
+ %23 = load i32, i32* %22, align 4
+ %24 = mul nsw i32 %23, %a
+ %25 = getelementptr inbounds i32, i32* %y, i64 %21
+ %26 = load i32, i32* %25, align 4
+ %27 = add nsw i32 %24, %26
+ store i32 %27, i32* %22, align 4
+ ret void
+}
+
+; Make sure we don't crash on this one.
+define void @SAXPY_crash(i32* noalias nocapture %x, i32* noalias nocapture %y, i64 %i) {
+; CHECK-LABEL: @SAXPY_crash(
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[I:%.*]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 undef, [[TMP4]]
+; CHECK-NEXT: store i32 [[TMP5]], i32* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 undef, [[TMP9]]
+; CHECK-NEXT: store i32 [[TMP10]], i32* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
+ %1 = add i64 %i, 1
+ %2 = getelementptr inbounds i32, i32* %x, i64 %1
+ %3 = getelementptr inbounds i32, i32* %y, i64 %1
+ %4 = load i32, i32* %3, align 4
+ %5 = add nsw i32 undef, %4
+ store i32 %5, i32* %2, align 4
+ %6 = add i64 %i, 2
+ %7 = getelementptr inbounds i32, i32* %x, i64 %6
+ %8 = getelementptr inbounds i32, i32* %y, i64 %6
+ %9 = load i32, i32* %8, align 4
+ %10 = add nsw i32 undef, %9
+ store i32 %10, i32* %7, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
new file mode 100644
index 00000000000..bff947e28ca
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4
+@b = common local_unnamed_addr global [1 x i32] zeroinitializer, align 4
+
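+; Each scalar group below computes a[i] = ((unsigned)b[i] >> 31) ^ 1, i.e. the
+; inverted sign bit of b[i]; a C sketch (illustrative only):
+;
+;   for (int i = 0; i < 6; ++i)
+;     a[i] = ((unsigned)b[i] >> 31) ^ 1;
+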
+define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
+; CHECK-LABEL: @slp_schedule_bundle(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
+; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
+; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1
+; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
+; CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31
+; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1
+; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+; CHECK-NEXT: ret i32 undef
+;
+entry:
+ %0 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 0, i64 0), align 4
+ %.lobit = lshr i32 %0, 31
+ %.lobit.not = xor i32 %.lobit, 1
+ store i32 %.lobit.not, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i64 0, i64 0), align 4
+ %1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 1, i64 0), align 4
+ %.lobit.1 = lshr i32 %1, 31
+ %.lobit.not.1 = xor i32 %.lobit.1, 1
+ store i32 %.lobit.not.1, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @a, i64 1, i64 0), align 4
+ %2 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 2, i64 0), align 4
+ %.lobit.2 = lshr i32 %2, 31
+ %.lobit.not.2 = xor i32 %.lobit.2, 1
+ store i32 %.lobit.not.2, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 2, i64 0), align 4
+ %3 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 3, i64 0), align 4
+ %.lobit.3 = lshr i32 %3, 31
+ %.lobit.not.3 = xor i32 %.lobit.3, 1
+ store i32 %.lobit.not.3, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 3, i64 0), align 4
+ %4 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
+ %.lobit.4 = lshr i32 %4, 31
+ %.lobit.not.4 = xor i32 %.lobit.4, 1
+ store i32 %.lobit.not.4, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4
+ %5 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4
+ %.lobit.5 = lshr i32 %5, 31
+ %.lobit.not.5 = xor i32 %.lobit.5, 1
+ store i32 %.lobit.not.5, i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
new file mode 100644
index 00000000000..0cd08f0e8c3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -slp-schedule-budget=16 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Test that the budget for the scheduling region size works.
+; We use a reduced budget of 16, which should prevent the loads from being vectorized.
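+; Conceptually the body is two 4-element copies (a C sketch; the @unknown
+; calls pad the scheduling region between the first copy's loads and stores):
+;
+;   for (int i = 0; i < 4; ++i) b[i] = a[i]; // region too large: not vectorized
+;   for (int i = 0; i < 4; ++i) d[i] = c[i]; // compact region: vectorized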
+
+declare void @unknown()
+
+define void @test(float * %a, float * %b, float * %c, float * %d) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[L0:%.*]] = load float, float* [[A:%.*]]
+; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1
+; CHECK-NEXT: [[L1:%.*]] = load float, float* [[A1]]
+; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2
+; CHECK-NEXT: [[L2:%.*]] = load float, float* [[A2]]
+; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3
+; CHECK-NEXT: [[L3:%.*]] = load float, float* [[A3]]
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: store float [[L0]], float* [[B:%.*]]
+; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1
+; CHECK-NEXT: store float [[L1]], float* [[B1]]
+; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2
+; CHECK-NEXT: store float [[L2]], float* [[B2]]
+; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3
+; CHECK-NEXT: store float [[L3]], float* [[B3]]
+; CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1
+; CHECK-NEXT: [[C2:%.*]] = getelementptr inbounds float, float* [[C]], i64 2
+; CHECK-NEXT: [[C3:%.*]] = getelementptr inbounds float, float* [[C]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[C]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 1
+; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, float* [[D]], i64 2
+; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds float, float* [[D]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[D]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ ; Don't vectorize these loads.
+ %l0 = load float, float* %a
+ %a1 = getelementptr inbounds float, float* %a, i64 1
+ %l1 = load float, float* %a1
+ %a2 = getelementptr inbounds float, float* %a, i64 2
+ %l2 = load float, float* %a2
+ %a3 = getelementptr inbounds float, float* %a, i64 3
+ %l3 = load float, float* %a3
+
+  ; Some unrelated instructions in between to enlarge the scheduling region.
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+ call void @unknown()
+
+ ; Don't vectorize these stores because their operands are too far away.
+ store float %l0, float* %b
+ %b1 = getelementptr inbounds float, float* %b, i64 1
+ store float %l1, float* %b1
+ %b2 = getelementptr inbounds float, float* %b, i64 2
+ store float %l2, float* %b2
+ %b3 = getelementptr inbounds float, float* %b, i64 3
+ store float %l3, float* %b3
+
+  ; But still vectorize the following instructions, because even if the budget
+  ; is exceeded, there is a minimum region size.
+ %l4 = load float, float* %c
+ %c1 = getelementptr inbounds float, float* %c, i64 1
+ %l5 = load float, float* %c1
+ %c2 = getelementptr inbounds float, float* %c, i64 2
+ %l6 = load float, float* %c2
+ %c3 = getelementptr inbounds float, float* %c, i64 3
+ %l7 = load float, float* %c3
+
+ store float %l4, float* %d
+ %d1 = getelementptr inbounds float, float* %d, i64 1
+ store float %l5, float* %d1
+ %d2 = getelementptr inbounds float, float* %d, i64 2
+ store float %l6, float* %d2
+ %d3 = getelementptr inbounds float, float* %d, i64 3
+ store float %l7, float* %d3
+
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
new file mode 100644
index 00000000000..c4f521c8963
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
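+; Each loop iteration pairs diff[8*i+j] with diff[8*i+j+4] for j = 0..3,
+; stores the four sums into row i of a local 8x8 matrix, and accumulates
+; their total; a C sketch (illustrative only):
+;
+;   for (int i = 0; i < 8; ++i)
+;     for (int j = 0; j < 4; ++j) {
+;       m2[i][j] = diff[8*i + j] + diff[8*i + j + 4];
+;       a += m2[i][j];
+;     }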
+define i32 @foo(i32* nocapture readonly %diff) #0 {
+; CHECK-LABEL: @foo(
+; CHECK: load <4 x i32>
+; CHECK: load <4 x i32>
+; CHECK: [[S1:%.+]] = add nsw <4 x i32>
+; CHECK: store <4 x i32> [[S1]],
+; CHECK: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[S1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add nsw <4 x i32> [[S1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK: [[ADD52:%.*]] = add nsw i32 [[TMP15]],
+; CHECK: ret i32 [[ADD52]]
+;
+entry:
+ %m2 = alloca [8 x [8 x i32]], align 16
+ %0 = bitcast [8 x [8 x i32]]* %m2 to i8*
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %a.088 = phi i32 [ 0, %entry ], [ %add52, %for.body ]
+ %1 = shl i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds i32, i32* %diff, i64 %1
+ %2 = load i32, i32* %arrayidx, align 4
+ %3 = or i64 %1, 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %diff, i64 %3
+ %4 = load i32, i32* %arrayidx2, align 4
+ %add3 = add nsw i32 %4, %2
+ %arrayidx6 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 0
+ store i32 %add3, i32* %arrayidx6, align 16
+ %add10 = add nsw i32 %add3, %a.088
+ %5 = or i64 %1, 1
+ %arrayidx13 = getelementptr inbounds i32, i32* %diff, i64 %5
+ %6 = load i32, i32* %arrayidx13, align 4
+ %7 = or i64 %1, 5
+ %arrayidx16 = getelementptr inbounds i32, i32* %diff, i64 %7
+ %8 = load i32, i32* %arrayidx16, align 4
+ %add17 = add nsw i32 %8, %6
+ %arrayidx20 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 1
+ store i32 %add17, i32* %arrayidx20, align 4
+ %add24 = add nsw i32 %add10, %add17
+ %9 = or i64 %1, 2
+ %arrayidx27 = getelementptr inbounds i32, i32* %diff, i64 %9
+ %10 = load i32, i32* %arrayidx27, align 4
+ %11 = or i64 %1, 6
+ %arrayidx30 = getelementptr inbounds i32, i32* %diff, i64 %11
+ %12 = load i32, i32* %arrayidx30, align 4
+ %add31 = add nsw i32 %12, %10
+ %arrayidx34 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 2
+ store i32 %add31, i32* %arrayidx34, align 8
+ %add38 = add nsw i32 %add24, %add31
+ %13 = or i64 %1, 3
+ %arrayidx41 = getelementptr inbounds i32, i32* %diff, i64 %13
+ %14 = load i32, i32* %arrayidx41, align 4
+ %15 = or i64 %1, 7
+ %arrayidx44 = getelementptr inbounds i32, i32* %diff, i64 %15
+ %16 = load i32, i32* %arrayidx44, align 4
+ %add45 = add nsw i32 %16, %14
+ %arrayidx48 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 %indvars.iv, i64 3
+ store i32 %add45, i32* %arrayidx48, align 4
+ %add52 = add nsw i32 %add38, %add45
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 8
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %arraydecay = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %m2, i64 0, i64 0
+ call void @ff([8 x i32]* %arraydecay) #1
+ ret i32 %add52
+}
+
+declare void @ff([8 x i32]*) #2
+
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
new file mode 100644
index 00000000000..75406c927d9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
@@ -0,0 +1,911 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
+
+;
+; vXi8
+;
+
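+; Each test loads adjacent i8 elements, sign-extends them, and rebuilds the
+; result vector lane by lane; e.g. @loadext_2i8_to_2i64 in C terms
+; (illustrative only, given signed char *p):
+;
+;   long long v[2] = { p[0], p[1] };
+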
+define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
+; SSE2-LABEL: @loadext_2i8_to_2i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i8_to_2i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i8_to_2i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: ret <2 x i64> [[V1]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %x0 = sext i8 %i0 to i64
+ %x1 = sext i8 %i1 to i64
+ %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
+ ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i32(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32
+; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32
+; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32
+; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SSE2-NEXT: ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i32(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SLM-NEXT: ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i32(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT: ret <4 x i32> [[V3]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %x0 = sext i8 %i0 to i32
+ %x1 = sext i8 %i1 to i32
+ %x2 = sext i8 %i2 to i32
+ %x3 = sext i8 %i3 to i32
+ %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+ %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
+ %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
+ %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
+ ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64
+; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT: ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64
+; AVX-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT: ret <4 x i64> [[V3]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %x0 = sext i8 %i0 to i64
+ %x1 = sext i8 %i1 to i64
+ %x2 = sext i8 %i2 to i64
+ %x3 = sext i8 %i3 to i64
+ %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
+ %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
+ %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
+ ret <4 x i64> %v3
+}
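+; Note: for i8 -> i64 the AVX cost model only vectorizes the first two lanes
+; (a <2 x i8> load sign-extended to <2 x i64>); lanes 2 and 3 stay as scalar
+; loads and sexts. SLM vectorizes all four lanes and SSE2 stays fully scalar.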
+
+define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i16(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT: ret <8 x i16> [[V7]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+ %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+ %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+ %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %i4 = load i8, i8* %p4, align 1
+ %i5 = load i8, i8* %p5, align 1
+ %i6 = load i8, i8* %p6, align 1
+ %i7 = load i8, i8* %p7, align 1
+ %x0 = sext i8 %i0 to i16
+ %x1 = sext i8 %i1 to i16
+ %x2 = sext i8 %i2 to i16
+ %x3 = sext i8 %i3 to i16
+ %x4 = sext i8 %i4 to i16
+ %x5 = sext i8 %i5 to i16
+ %x6 = sext i8 %i6 to i16
+ %x7 = sext i8 %i7 to i16
+ %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+ %v1 = insertelement <8 x i16> %v0, i16 %x1, i32 1
+ %v2 = insertelement <8 x i16> %v1, i16 %x2, i32 2
+ %v3 = insertelement <8 x i16> %v2, i16 %x3, i32 3
+ %v4 = insertelement <8 x i16> %v3, i16 %x4, i32 4
+ %v5 = insertelement <8 x i16> %v4, i16 %x5, i32 5
+ %v6 = insertelement <8 x i16> %v5, i16 %x6, i32 6
+ %v7 = insertelement <8 x i16> %v6, i16 %x7, i32 7
+ ret <8 x i16> %v7
+}
+
+define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
+; SSE-LABEL: @loadext_8i8_to_8i32(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE-NEXT: ret <8 x i32> [[V7]]
+;
+; AVX1-LABEL: @loadext_8i8_to_8i32(
+; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX1-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX1-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX1-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX1-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX1-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; AVX1-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; AVX1-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; AVX1-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; AVX1-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
+; AVX1-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32
+; AVX1-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32
+; AVX1-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32
+; AVX1-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32
+; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX1-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX1-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX1-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; AVX1-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; AVX1-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; AVX1-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; AVX1-NEXT: ret <8 x i32> [[V7]]
+;
+; AVX2-LABEL: @loadext_8i8_to_8i32(
+; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX2-NEXT: ret <8 x i32> [[V7]]
+;
+; AVX512-LABEL: @loadext_8i8_to_8i32(
+; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX512-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX512-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX512-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX512-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX512-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX512-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX512-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX512-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX512-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX512-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX512-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX512-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX512-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX512-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX512-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX512-NEXT: ret <8 x i32> [[V7]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+ %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+ %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+ %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %i4 = load i8, i8* %p4, align 1
+ %i5 = load i8, i8* %p5, align 1
+ %i6 = load i8, i8* %p6, align 1
+ %i7 = load i8, i8* %p7, align 1
+ %x0 = sext i8 %i0 to i32
+ %x1 = sext i8 %i1 to i32
+ %x2 = sext i8 %i2 to i32
+ %x3 = sext i8 %i3 to i32
+ %x4 = sext i8 %i4 to i32
+ %x5 = sext i8 %i5 to i32
+ %x6 = sext i8 %i6 to i32
+ %x7 = sext i8 %i7 to i32
+ %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+ %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
+ %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
+ %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
+ %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
+ %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
+ %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
+ %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
+ ret <8 x i32> %v7
+}
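+; Note: AVX1 splits this case, vectorizing only the low four lanes via a
+; <4 x i8> load and keeping lanes 4-7 scalar, while SSE, AVX2 and AVX512
+; all vectorize the full <8 x i8> load.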
+
+define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
+; CHECK-LABEL: @loadext_16i8_to_16i16(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; CHECK-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; CHECK-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; CHECK-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; CHECK-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; CHECK-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; CHECK-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; CHECK-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; CHECK-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; CHECK-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; CHECK-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; CHECK-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; CHECK-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; CHECK-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; CHECK-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; CHECK-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; CHECK-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; CHECK-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; CHECK-NEXT: ret <16 x i16> [[V15]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+ %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+ %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+ %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+ %p8 = getelementptr inbounds i8, i8* %p0, i64 8
+ %p9 = getelementptr inbounds i8, i8* %p0, i64 9
+ %p10 = getelementptr inbounds i8, i8* %p0, i64 10
+ %p11 = getelementptr inbounds i8, i8* %p0, i64 11
+ %p12 = getelementptr inbounds i8, i8* %p0, i64 12
+ %p13 = getelementptr inbounds i8, i8* %p0, i64 13
+ %p14 = getelementptr inbounds i8, i8* %p0, i64 14
+ %p15 = getelementptr inbounds i8, i8* %p0, i64 15
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %i4 = load i8, i8* %p4, align 1
+ %i5 = load i8, i8* %p5, align 1
+ %i6 = load i8, i8* %p6, align 1
+ %i7 = load i8, i8* %p7, align 1
+ %i8 = load i8, i8* %p8, align 1
+ %i9 = load i8, i8* %p9, align 1
+ %i10 = load i8, i8* %p10, align 1
+ %i11 = load i8, i8* %p11, align 1
+ %i12 = load i8, i8* %p12, align 1
+ %i13 = load i8, i8* %p13, align 1
+ %i14 = load i8, i8* %p14, align 1
+ %i15 = load i8, i8* %p15, align 1
+ %x0 = sext i8 %i0 to i16
+ %x1 = sext i8 %i1 to i16
+ %x2 = sext i8 %i2 to i16
+ %x3 = sext i8 %i3 to i16
+ %x4 = sext i8 %i4 to i16
+ %x5 = sext i8 %i5 to i16
+ %x6 = sext i8 %i6 to i16
+ %x7 = sext i8 %i7 to i16
+ %x8 = sext i8 %i8 to i16
+ %x9 = sext i8 %i9 to i16
+ %x10 = sext i8 %i10 to i16
+ %x11 = sext i8 %i11 to i16
+ %x12 = sext i8 %i12 to i16
+ %x13 = sext i8 %i13 to i16
+ %x14 = sext i8 %i14 to i16
+ %x15 = sext i8 %i15 to i16
+ %v0 = insertelement <16 x i16> undef, i16 %x0, i32 0
+ %v1 = insertelement <16 x i16> %v0, i16 %x1, i32 1
+ %v2 = insertelement <16 x i16> %v1, i16 %x2, i32 2
+ %v3 = insertelement <16 x i16> %v2, i16 %x3, i32 3
+ %v4 = insertelement <16 x i16> %v3, i16 %x4, i32 4
+ %v5 = insertelement <16 x i16> %v4, i16 %x5, i32 5
+ %v6 = insertelement <16 x i16> %v5, i16 %x6, i32 6
+ %v7 = insertelement <16 x i16> %v6, i16 %x7, i32 7
+ %v8 = insertelement <16 x i16> %v7, i16 %x8, i32 8
+ %v9 = insertelement <16 x i16> %v8, i16 %x9, i32 9
+ %v10 = insertelement <16 x i16> %v9, i16 %x10, i32 10
+ %v11 = insertelement <16 x i16> %v10, i16 %x11, i32 11
+ %v12 = insertelement <16 x i16> %v11, i16 %x12, i32 12
+ %v13 = insertelement <16 x i16> %v12, i16 %x13, i32 13
+ %v14 = insertelement <16 x i16> %v13, i16 %x14, i32 14
+ %v15 = insertelement <16 x i16> %v14, i16 %x15, i32 15
+ ret <16 x i16> %v15
+}
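+; Note: the common CHECK prefix applies here because every tested target
+; vectorizes the full <16 x i8> load and sign extension to <16 x i16>.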
+
+;
+; vXi16
+;
+
+define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
+; SSE2-LABEL: @loadext_2i16_to_2i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i16_to_2i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i16_to_2i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: ret <2 x i64> [[V1]]
+;
+ %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+ %i0 = load i16, i16* %p0, align 1
+ %i1 = load i16, i16* %p1, align 1
+ %x0 = sext i16 %i0 to i64
+ %x1 = sext i16 %i1 to i64
+ %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
+ ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
+; CHECK-LABEL: @loadext_4i16_to_4i32(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[V3]]
+;
+ %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+ %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+ %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+ %i0 = load i16, i16* %p0, align 1
+ %i1 = load i16, i16* %p1, align 1
+ %i2 = load i16, i16* %p2, align 1
+ %i3 = load i16, i16* %p3, align 1
+ %x0 = sext i16 %i0 to i32
+ %x1 = sext i16 %i1 to i32
+ %x2 = sext i16 %i2 to i32
+ %x3 = sext i16 %i3 to i32
+ %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+ %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
+ %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
+ %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
+ ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE2-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64
+; SSE2-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT: ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64
+; AVX-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT: ret <4 x i64> [[V3]]
+;
+ %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+ %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+ %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+ %i0 = load i16, i16* %p0, align 1
+ %i1 = load i16, i16* %p1, align 1
+ %i2 = load i16, i16* %p2, align 1
+ %i3 = load i16, i16* %p3, align 1
+ %x0 = sext i16 %i0 to i64
+ %x1 = sext i16 %i1 to i64
+ %x2 = sext i16 %i2 to i64
+ %x3 = sext i16 %i3 to i64
+ %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
+ %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
+ %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
+ ret <4 x i64> %v3
+}
+
+define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
+; CHECK-LABEL: @loadext_8i16_to_8i32(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT: ret <8 x i32> [[V7]]
+;
+ %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+ %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+ %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+ %p4 = getelementptr inbounds i16, i16* %p0, i64 4
+ %p5 = getelementptr inbounds i16, i16* %p0, i64 5
+ %p6 = getelementptr inbounds i16, i16* %p0, i64 6
+ %p7 = getelementptr inbounds i16, i16* %p0, i64 7
+ %i0 = load i16, i16* %p0, align 1
+ %i1 = load i16, i16* %p1, align 1
+ %i2 = load i16, i16* %p2, align 1
+ %i3 = load i16, i16* %p3, align 1
+ %i4 = load i16, i16* %p4, align 1
+ %i5 = load i16, i16* %p5, align 1
+ %i6 = load i16, i16* %p6, align 1
+ %i7 = load i16, i16* %p7, align 1
+ %x0 = sext i16 %i0 to i32
+ %x1 = sext i16 %i1 to i32
+ %x2 = sext i16 %i2 to i32
+ %x3 = sext i16 %i3 to i32
+ %x4 = sext i16 %i4 to i32
+ %x5 = sext i16 %i5 to i32
+ %x6 = sext i16 %i6 to i32
+ %x7 = sext i16 %i7 to i32
+ %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+ %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
+ %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
+ %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
+ %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
+ %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
+ %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
+ %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
+ ret <8 x i32> %v7
+}
+
+;
+; vXi32
+;
+
+define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
+; SSE2-LABEL: @loadext_2i32_to_2i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i32_to_2i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i32_to_2i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: ret <2 x i64> [[V1]]
+;
+ %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+ %i0 = load i32, i32* %p0, align 1
+ %i1 = load i32, i32* %p1, align 1
+ %x0 = sext i32 %i0 to i64
+ %x1 = sext i32 %i1 to i64
+ %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
+ ret <2 x i64> %v1
+}
+
+define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
+; SSE2-LABEL: @loadext_4i32_to_4i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE2-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64
+; SSE2-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT: ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i32_to_4i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX1-LABEL: @loadext_4i32_to_4i64(
+; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX1-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64
+; AVX1-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64
+; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i32_to_4i64(
+; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i32_to_4i64(
+; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT: ret <4 x i64> [[V3]]
+;
+ %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+ %p2 = getelementptr inbounds i32, i32* %p0, i64 2
+ %p3 = getelementptr inbounds i32, i32* %p0, i64 3
+ %i0 = load i32, i32* %p0, align 1
+ %i1 = load i32, i32* %p1, align 1
+ %i2 = load i32, i32* %p2, align 1
+ %i3 = load i32, i32* %p3, align 1
+ %x0 = sext i32 %i0 to i64
+ %x1 = sext i32 %i1 to i64
+ %x2 = sext i32 %i2 to i64
+ %x3 = sext i32 %i3 to i64
+ %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
+ %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
+ %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
+ ret <4 x i64> %v3
+}
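+; Note: SLM vectorizes all four i32 -> i64 lanes while AVX1 only covers the
+; low two via a <2 x i32> load; AVX2 and AVX512 use the full <4 x i32> load
+; and SSE2 stays fully scalar.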
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
new file mode 100644
index 00000000000..e4b71ba974d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
@@ -0,0 +1,914 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
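+; Each test below loads every element of the source arrays, applies a
+; variable arithmetic shift right, and stores the results; the per-target
+; check prefixes show which vector widths the SLP cost model settles on.
+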
+define void @ashr_v8i64() {
+; SSE-LABEL: @ashr_v8i64(
+; SSE-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SSE-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SSE-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SSE-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SSE-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SSE-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SSE-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SSE-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SSE-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SSE-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SSE-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SSE-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SSE-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SSE-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SSE-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SSE-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SSE-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]]
+; SSE-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SSE-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SSE-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SSE-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SSE-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SSE-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SSE-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ashr_v8i64(
+; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; AVX1-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; AVX1-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; AVX1-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; AVX1-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; AVX1-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; AVX1-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; AVX1-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; AVX1-NEXT: [[R0:%.*]] = ashr i64 [[A0]], [[B0]]
+; AVX1-NEXT: [[R1:%.*]] = ashr i64 [[A1]], [[B1]]
+; AVX1-NEXT: [[R2:%.*]] = ashr i64 [[A2]], [[B2]]
+; AVX1-NEXT: [[R3:%.*]] = ashr i64 [[A3]], [[B3]]
+; AVX1-NEXT: [[R4:%.*]] = ashr i64 [[A4]], [[B4]]
+; AVX1-NEXT: [[R5:%.*]] = ashr i64 [[A5]], [[B5]]
+; AVX1-NEXT: [[R6:%.*]] = ashr i64 [[A6]], [[B6]]
+; AVX1-NEXT: [[R7:%.*]] = ashr i64 [[A7]], [[B7]]
+; AVX1-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; AVX1-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; AVX1-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; AVX1-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; AVX1-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; AVX1-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; AVX1-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; AVX1-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ashr_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @ashr_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @ashr_v8i64(
+; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = ashr i64 %a0, %b0
+ %r1 = ashr i64 %a1, %b1
+ %r2 = ashr i64 %a2, %b2
+ %r3 = ashr i64 %a3, %b3
+ %r4 = ashr i64 %a4, %b4
+ %r5 = ashr i64 %a5, %b5
+ %r6 = ashr i64 %a6, %b6
+ %r7 = ashr i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
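+; Per the CHECK lines below, i32 ashr is expected to stay scalar on SSE,
+; while AVX1 splits into four <4 x i32> ops, AVX2 and XOP into two
+; <8 x i32> ops, and AVX512 into a single <16 x i32> op.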
+define void @ashr_v16i32() {
+; SSE-LABEL: @ashr_v16i32(
+; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SSE-NEXT: [[R0:%.*]] = ashr i32 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = ashr i32 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = ashr i32 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = ashr i32 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = ashr i32 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = ashr i32 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = ashr i32 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = ashr i32 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = ashr i32 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = ashr i32 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = ashr i32 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = ashr i32 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = ashr i32 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = ashr i32 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = ashr i32 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = ashr i32 [[A15]], [[B15]]
+; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @ashr_v16i32(
+; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]]
+; AVX1-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]]
+; AVX1-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]]
+; AVX1-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]]
+; AVX1-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; AVX1-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; AVX1-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; AVX1-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @ashr_v16i32(
+; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX2-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @ashr_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = ashr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @ashr_v16i32(
+; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = ashr i32 %a0 , %b0
+ %r1 = ashr i32 %a1 , %b1
+ %r2 = ashr i32 %a2 , %b2
+ %r3 = ashr i32 %a3 , %b3
+ %r4 = ashr i32 %a4 , %b4
+ %r5 = ashr i32 %a5 , %b5
+ %r6 = ashr i32 %a6 , %b6
+ %r7 = ashr i32 %a7 , %b7
+ %r8 = ashr i32 %a8 , %b8
+ %r9 = ashr i32 %a9 , %b9
+ %r10 = ashr i32 %a10, %b10
+ %r11 = ashr i32 %a11, %b11
+ %r12 = ashr i32 %a12, %b12
+ %r13 = ashr i32 %a13, %b13
+ %r14 = ashr i32 %a14, %b14
+ %r15 = ashr i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
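+; Per the CHECK lines below, i16 ashr is expected to stay scalar on SSE,
+; while AVX, AVX512, and XOP all split into two <16 x i16> ops.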
+define void @ashr_v32i16() {
+; SSE-LABEL: @ashr_v32i16(
+; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT: [[R0:%.*]] = ashr i16 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = ashr i16 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = ashr i16 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = ashr i16 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = ashr i16 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = ashr i16 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = ashr i16 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = ashr i16 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = ashr i16 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = ashr i16 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = ashr i16 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = ashr i16 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = ashr i16 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = ashr i16 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = ashr i16 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = ashr i16 [[A15]], [[B15]]
+; SSE-NEXT: [[R16:%.*]] = ashr i16 [[A16]], [[B16]]
+; SSE-NEXT: [[R17:%.*]] = ashr i16 [[A17]], [[B17]]
+; SSE-NEXT: [[R18:%.*]] = ashr i16 [[A18]], [[B18]]
+; SSE-NEXT: [[R19:%.*]] = ashr i16 [[A19]], [[B19]]
+; SSE-NEXT: [[R20:%.*]] = ashr i16 [[A20]], [[B20]]
+; SSE-NEXT: [[R21:%.*]] = ashr i16 [[A21]], [[B21]]
+; SSE-NEXT: [[R22:%.*]] = ashr i16 [[A22]], [[B22]]
+; SSE-NEXT: [[R23:%.*]] = ashr i16 [[A23]], [[B23]]
+; SSE-NEXT: [[R24:%.*]] = ashr i16 [[A24]], [[B24]]
+; SSE-NEXT: [[R25:%.*]] = ashr i16 [[A25]], [[B25]]
+; SSE-NEXT: [[R26:%.*]] = ashr i16 [[A26]], [[B26]]
+; SSE-NEXT: [[R27:%.*]] = ashr i16 [[A27]], [[B27]]
+; SSE-NEXT: [[R28:%.*]] = ashr i16 [[A28]], [[B28]]
+; SSE-NEXT: [[R29:%.*]] = ashr i16 [[A29]], [[B29]]
+; SSE-NEXT: [[R30:%.*]] = ashr i16 [[A30]], [[B30]]
+; SSE-NEXT: [[R31:%.*]] = ashr i16 [[A31]], [[B31]]
+; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @ashr_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @ashr_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @ashr_v32i16(
+; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = ashr i16 %a0 , %b0
+ %r1 = ashr i16 %a1 , %b1
+ %r2 = ashr i16 %a2 , %b2
+ %r3 = ashr i16 %a3 , %b3
+ %r4 = ashr i16 %a4 , %b4
+ %r5 = ashr i16 %a5 , %b5
+ %r6 = ashr i16 %a6 , %b6
+ %r7 = ashr i16 %a7 , %b7
+ %r8 = ashr i16 %a8 , %b8
+ %r9 = ashr i16 %a9 , %b9
+ %r10 = ashr i16 %a10, %b10
+ %r11 = ashr i16 %a11, %b11
+ %r12 = ashr i16 %a12, %b12
+ %r13 = ashr i16 %a13, %b13
+ %r14 = ashr i16 %a14, %b14
+ %r15 = ashr i16 %a15, %b15
+ %r16 = ashr i16 %a16, %b16
+ %r17 = ashr i16 %a17, %b17
+ %r18 = ashr i16 %a18, %b18
+ %r19 = ashr i16 %a19, %b19
+ %r20 = ashr i16 %a20, %b20
+ %r21 = ashr i16 %a21, %b21
+ %r22 = ashr i16 %a22, %b22
+ %r23 = ashr i16 %a23, %b23
+ %r24 = ashr i16 %a24, %b24
+ %r25 = ashr i16 %a25, %b25
+ %r26 = ashr i16 %a26, %b26
+ %r27 = ashr i16 %a27, %b27
+ %r28 = ashr i16 %a28, %b28
+ %r29 = ashr i16 %a29, %b29
+ %r30 = ashr i16 %a30, %b30
+ %r31 = ashr i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
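+; All targets share one set of CHECK lines here: i8 ashr is expected to
+; vectorize into four <16 x i8> ops everywhere.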
+define void @ashr_v64i8() {
+; CHECK-LABEL: @ashr_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = ashr i8 %a0 , %b0
+ %r1 = ashr i8 %a1 , %b1
+ %r2 = ashr i8 %a2 , %b2
+ %r3 = ashr i8 %a3 , %b3
+ %r4 = ashr i8 %a4 , %b4
+ %r5 = ashr i8 %a5 , %b5
+ %r6 = ashr i8 %a6 , %b6
+ %r7 = ashr i8 %a7 , %b7
+ %r8 = ashr i8 %a8 , %b8
+ %r9 = ashr i8 %a9 , %b9
+ %r10 = ashr i8 %a10, %b10
+ %r11 = ashr i8 %a11, %b11
+ %r12 = ashr i8 %a12, %b12
+ %r13 = ashr i8 %a13, %b13
+ %r14 = ashr i8 %a14, %b14
+ %r15 = ashr i8 %a15, %b15
+ %r16 = ashr i8 %a16, %b16
+ %r17 = ashr i8 %a17, %b17
+ %r18 = ashr i8 %a18, %b18
+ %r19 = ashr i8 %a19, %b19
+ %r20 = ashr i8 %a20, %b20
+ %r21 = ashr i8 %a21, %b21
+ %r22 = ashr i8 %a22, %b22
+ %r23 = ashr i8 %a23, %b23
+ %r24 = ashr i8 %a24, %b24
+ %r25 = ashr i8 %a25, %b25
+ %r26 = ashr i8 %a26, %b26
+ %r27 = ashr i8 %a27, %b27
+ %r28 = ashr i8 %a28, %b28
+ %r29 = ashr i8 %a29, %b29
+ %r30 = ashr i8 %a30, %b30
+ %r31 = ashr i8 %a31, %b31
+ %r32 = ashr i8 %a32, %b32
+ %r33 = ashr i8 %a33, %b33
+ %r34 = ashr i8 %a34, %b34
+ %r35 = ashr i8 %a35, %b35
+ %r36 = ashr i8 %a36, %b36
+ %r37 = ashr i8 %a37, %b37
+ %r38 = ashr i8 %a38, %b38
+ %r39 = ashr i8 %a39, %b39
+ %r40 = ashr i8 %a40, %b40
+ %r41 = ashr i8 %a41, %b41
+ %r42 = ashr i8 %a42, %b42
+ %r43 = ashr i8 %a43, %b43
+ %r44 = ashr i8 %a44, %b44
+ %r45 = ashr i8 %a45, %b45
+ %r46 = ashr i8 %a46, %b46
+ %r47 = ashr i8 %a47, %b47
+ %r48 = ashr i8 %a48, %b48
+ %r49 = ashr i8 %a49, %b49
+ %r50 = ashr i8 %a50, %b50
+ %r51 = ashr i8 %a51, %b51
+ %r52 = ashr i8 %a52, %b52
+ %r53 = ashr i8 %a53, %b53
+ %r54 = ashr i8 %a54, %b54
+ %r55 = ashr i8 %a55, %b55
+ %r56 = ashr i8 %a56, %b56
+ %r57 = ashr i8 %a57, %b57
+ %r58 = ashr i8 %a58, %b58
+ %r59 = ashr i8 %a59, %b59
+ %r60 = ashr i8 %a60, %b60
+ %r61 = ashr i8 %a61, %b61
+ %r62 = ashr i8 %a62, %b62
+ %r63 = ashr i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
new file mode 100644
index 00000000000..237673da5a9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -0,0 +1,863 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
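+; Layout note for this file (a brief summary, inferred from the autogenerated
+; CHECK lines at the top of each function): every test loads all lanes of a
+; pair of global source arrays, applies a scalar lshr per lane, and stores
+; the results to the matching @c* array. The SLP vectorizer is expected to
+; merge these scalar chains into legal vector shifts sized to the target,
+; e.g. for the v8i64 test: <2 x i64> under SSE/AVX1, <4 x i64> under
+; AVX2/XOP, and <8 x i64> under AVX512.
+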
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @lshr_v8i64() {
+; SSE-LABEL: @lshr_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @lshr_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]]
+; AVX1-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]]
+; AVX1-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]]
+; AVX1-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]]
+; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @lshr_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @lshr_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = lshr <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @lshr_v8i64(
+; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = lshr i64 %a0, %b0
+ %r1 = lshr i64 %a1, %b1
+ %r2 = lshr i64 %a2, %b2
+ %r3 = lshr i64 %a3, %b3
+ %r4 = lshr i64 %a4, %b4
+ %r5 = lshr i64 %a5, %b5
+ %r6 = lshr i64 %a6, %b6
+ %r7 = lshr i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @lshr_v16i32() {
+; SSE-LABEL: @lshr_v16i32(
+; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4
+; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4
+; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4
+; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4
+; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4
+; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4
+; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4
+; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4
+; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4
+; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4
+; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+; SSE-NEXT: [[B0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0), align 4
+; SSE-NEXT: [[B1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1), align 4
+; SSE-NEXT: [[B2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2), align 4
+; SSE-NEXT: [[B3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3), align 4
+; SSE-NEXT: [[B4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4), align 4
+; SSE-NEXT: [[B5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5), align 4
+; SSE-NEXT: [[B6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6), align 4
+; SSE-NEXT: [[B7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7), align 4
+; SSE-NEXT: [[B8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8), align 4
+; SSE-NEXT: [[B9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9), align 4
+; SSE-NEXT: [[B10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+; SSE-NEXT: [[B11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+; SSE-NEXT: [[B12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+; SSE-NEXT: [[B13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+; SSE-NEXT: [[B14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+; SSE-NEXT: [[B15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+; SSE-NEXT: [[R0:%.*]] = lshr i32 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = lshr i32 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = lshr i32 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = lshr i32 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = lshr i32 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = lshr i32 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = lshr i32 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = lshr i32 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = lshr i32 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = lshr i32 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = lshr i32 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = lshr i32 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = lshr i32 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = lshr i32 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = lshr i32 [[A15]], [[B15]]
+; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4
+; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4
+; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4
+; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4
+; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4
+; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4
+; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4
+; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4
+; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4
+; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4
+; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @lshr_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @lshr_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = lshr <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @lshr_v16i32(
+; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = lshr i32 %a0 , %b0
+ %r1 = lshr i32 %a1 , %b1
+ %r2 = lshr i32 %a2 , %b2
+ %r3 = lshr i32 %a3 , %b3
+ %r4 = lshr i32 %a4 , %b4
+ %r5 = lshr i32 %a5 , %b5
+ %r6 = lshr i32 %a6 , %b6
+ %r7 = lshr i32 %a7 , %b7
+ %r8 = lshr i32 %a8 , %b8
+ %r9 = lshr i32 %a9 , %b9
+ %r10 = lshr i32 %a10, %b10
+ %r11 = lshr i32 %a11, %b11
+ %r12 = lshr i32 %a12, %b12
+ %r13 = lshr i32 %a13, %b13
+ %r14 = lshr i32 %a14, %b14
+ %r15 = lshr i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @lshr_v32i16() {
+; SSE-LABEL: @lshr_v32i16(
+; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT: [[R0:%.*]] = lshr i16 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = lshr i16 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = lshr i16 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = lshr i16 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = lshr i16 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = lshr i16 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = lshr i16 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = lshr i16 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = lshr i16 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = lshr i16 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = lshr i16 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = lshr i16 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = lshr i16 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = lshr i16 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = lshr i16 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = lshr i16 [[A15]], [[B15]]
+; SSE-NEXT: [[R16:%.*]] = lshr i16 [[A16]], [[B16]]
+; SSE-NEXT: [[R17:%.*]] = lshr i16 [[A17]], [[B17]]
+; SSE-NEXT: [[R18:%.*]] = lshr i16 [[A18]], [[B18]]
+; SSE-NEXT: [[R19:%.*]] = lshr i16 [[A19]], [[B19]]
+; SSE-NEXT: [[R20:%.*]] = lshr i16 [[A20]], [[B20]]
+; SSE-NEXT: [[R21:%.*]] = lshr i16 [[A21]], [[B21]]
+; SSE-NEXT: [[R22:%.*]] = lshr i16 [[A22]], [[B22]]
+; SSE-NEXT: [[R23:%.*]] = lshr i16 [[A23]], [[B23]]
+; SSE-NEXT: [[R24:%.*]] = lshr i16 [[A24]], [[B24]]
+; SSE-NEXT: [[R25:%.*]] = lshr i16 [[A25]], [[B25]]
+; SSE-NEXT: [[R26:%.*]] = lshr i16 [[A26]], [[B26]]
+; SSE-NEXT: [[R27:%.*]] = lshr i16 [[A27]], [[B27]]
+; SSE-NEXT: [[R28:%.*]] = lshr i16 [[A28]], [[B28]]
+; SSE-NEXT: [[R29:%.*]] = lshr i16 [[A29]], [[B29]]
+; SSE-NEXT: [[R30:%.*]] = lshr i16 [[A30]], [[B30]]
+; SSE-NEXT: [[R31:%.*]] = lshr i16 [[A31]], [[B31]]
+; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @lshr_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @lshr_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @lshr_v32i16(
+; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = lshr i16 %a0 , %b0
+ %r1 = lshr i16 %a1 , %b1
+ %r2 = lshr i16 %a2 , %b2
+ %r3 = lshr i16 %a3 , %b3
+ %r4 = lshr i16 %a4 , %b4
+ %r5 = lshr i16 %a5 , %b5
+ %r6 = lshr i16 %a6 , %b6
+ %r7 = lshr i16 %a7 , %b7
+ %r8 = lshr i16 %a8 , %b8
+ %r9 = lshr i16 %a9 , %b9
+ %r10 = lshr i16 %a10, %b10
+ %r11 = lshr i16 %a11, %b11
+ %r12 = lshr i16 %a12, %b12
+ %r13 = lshr i16 %a13, %b13
+ %r14 = lshr i16 %a14, %b14
+ %r15 = lshr i16 %a15, %b15
+ %r16 = lshr i16 %a16, %b16
+ %r17 = lshr i16 %a17, %b17
+ %r18 = lshr i16 %a18, %b18
+ %r19 = lshr i16 %a19, %b19
+ %r20 = lshr i16 %a20, %b20
+ %r21 = lshr i16 %a21, %b21
+ %r22 = lshr i16 %a22, %b22
+ %r23 = lshr i16 %a23, %b23
+ %r24 = lshr i16 %a24, %b24
+ %r25 = lshr i16 %a25, %b25
+ %r26 = lshr i16 %a26, %b26
+ %r27 = lshr i16 %a27, %b27
+ %r28 = lshr i16 %a28, %b28
+ %r29 = lshr i16 %a29, %b29
+ %r30 = lshr i16 %a30, %b30
+ %r31 = lshr i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
+
+define void @lshr_v64i8() {
+; CHECK-LABEL: @lshr_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = lshr i8 %a0 , %b0
+ %r1 = lshr i8 %a1 , %b1
+ %r2 = lshr i8 %a2 , %b2
+ %r3 = lshr i8 %a3 , %b3
+ %r4 = lshr i8 %a4 , %b4
+ %r5 = lshr i8 %a5 , %b5
+ %r6 = lshr i8 %a6 , %b6
+ %r7 = lshr i8 %a7 , %b7
+ %r8 = lshr i8 %a8 , %b8
+ %r9 = lshr i8 %a9 , %b9
+ %r10 = lshr i8 %a10, %b10
+ %r11 = lshr i8 %a11, %b11
+ %r12 = lshr i8 %a12, %b12
+ %r13 = lshr i8 %a13, %b13
+ %r14 = lshr i8 %a14, %b14
+ %r15 = lshr i8 %a15, %b15
+ %r16 = lshr i8 %a16, %b16
+ %r17 = lshr i8 %a17, %b17
+ %r18 = lshr i8 %a18, %b18
+ %r19 = lshr i8 %a19, %b19
+ %r20 = lshr i8 %a20, %b20
+ %r21 = lshr i8 %a21, %b21
+ %r22 = lshr i8 %a22, %b22
+ %r23 = lshr i8 %a23, %b23
+ %r24 = lshr i8 %a24, %b24
+ %r25 = lshr i8 %a25, %b25
+ %r26 = lshr i8 %a26, %b26
+ %r27 = lshr i8 %a27, %b27
+ %r28 = lshr i8 %a28, %b28
+ %r29 = lshr i8 %a29, %b29
+ %r30 = lshr i8 %a30, %b30
+ %r31 = lshr i8 %a31, %b31
+ %r32 = lshr i8 %a32, %b32
+ %r33 = lshr i8 %a33, %b33
+ %r34 = lshr i8 %a34, %b34
+ %r35 = lshr i8 %a35, %b35
+ %r36 = lshr i8 %a36, %b36
+ %r37 = lshr i8 %a37, %b37
+ %r38 = lshr i8 %a38, %b38
+ %r39 = lshr i8 %a39, %b39
+ %r40 = lshr i8 %a40, %b40
+ %r41 = lshr i8 %a41, %b41
+ %r42 = lshr i8 %a42, %b42
+ %r43 = lshr i8 %a43, %b43
+ %r44 = lshr i8 %a44, %b44
+ %r45 = lshr i8 %a45, %b45
+ %r46 = lshr i8 %a46, %b46
+ %r47 = lshr i8 %a47, %b47
+ %r48 = lshr i8 %a48, %b48
+ %r49 = lshr i8 %a49, %b49
+ %r50 = lshr i8 %a50, %b50
+ %r51 = lshr i8 %a51, %b51
+ %r52 = lshr i8 %a52, %b52
+ %r53 = lshr i8 %a53, %b53
+ %r54 = lshr i8 %a54, %b54
+ %r55 = lshr i8 %a55, %b55
+ %r56 = lshr i8 %a56, %b56
+ %r57 = lshr i8 %a57, %b57
+ %r58 = lshr i8 %a58, %b58
+ %r59 = lshr i8 %a59, %b59
+ %r60 = lshr i8 %a60, %b60
+ %r61 = lshr i8 %a61, %b61
+ %r62 = lshr i8 %a62, %b62
+ %r63 = lshr i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
new file mode 100644
index 00000000000..8eadd04d1d0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -0,0 +1,815 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
+
+@a64 = common global [8 x i64] zeroinitializer, align 64
+@b64 = common global [8 x i64] zeroinitializer, align 64
+@c64 = common global [8 x i64] zeroinitializer, align 64
+@a32 = common global [16 x i32] zeroinitializer, align 64
+@b32 = common global [16 x i32] zeroinitializer, align 64
+@c32 = common global [16 x i32] zeroinitializer, align 64
+@a16 = common global [32 x i16] zeroinitializer, align 64
+@b16 = common global [32 x i16] zeroinitializer, align 64
+@c16 = common global [32 x i16] zeroinitializer, align 64
+@a8 = common global [64 x i8] zeroinitializer, align 64
+@b8 = common global [64 x i8] zeroinitializer, align 64
+@c8 = common global [64 x i8] zeroinitializer, align 64
+
+define void @shl_v8i64() {
+; SSE-LABEL: @shl_v8i64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX1-LABEL: @shl_v8i64(
+; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]]
+; AVX1-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]]
+; AVX1-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]]
+; AVX1-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]]
+; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
+; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; AVX1-NEXT: ret void
+;
+; AVX2-LABEL: @shl_v8i64(
+; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; AVX2-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; AVX2-NEXT: ret void
+;
+; AVX512-LABEL: @shl_v8i64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @a64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @b64 to <8 x i64>*), align 8
+; AVX512-NEXT: [[TMP3:%.*]] = shl <8 x i64> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <8 x i64> [[TMP3]], <8 x i64>* bitcast ([8 x i64]* @c64 to <8 x i64>*), align 8
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @shl_v8i64(
+; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8
+; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8
+; XOP-NEXT: ret void
+;
+ %a0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+ %a1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+ %a2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+ %a3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+ %a4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+ %a5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+ %a6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+ %a7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+ %b0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+ %b1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+ %b2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+ %b3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+ %b4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+ %b5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+ %b6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+ %b7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+ %r0 = shl i64 %a0, %b0
+ %r1 = shl i64 %a1, %b1
+ %r2 = shl i64 %a2, %b2
+ %r3 = shl i64 %a3, %b3
+ %r4 = shl i64 %a4, %b4
+ %r5 = shl i64 %a5, %b5
+ %r6 = shl i64 %a6, %b6
+ %r7 = shl i64 %a7, %b7
+ store i64 %r0, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+ store i64 %r1, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+ store i64 %r2, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+ store i64 %r3, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+ store i64 %r4, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+ store i64 %r5, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+ store i64 %r6, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+ store i64 %r7, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @shl_v16i32() {
+; SSE-LABEL: @shl_v16i32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP1]], [[TMP5]]
+; SSE-NEXT: [[TMP10:%.*]] = shl <4 x i32> [[TMP2]], [[TMP6]]
+; SSE-NEXT: [[TMP11:%.*]] = shl <4 x i32> [[TMP3]], [[TMP7]]
+; SSE-NEXT: [[TMP12:%.*]] = shl <4 x i32> [[TMP4]], [[TMP8]]
+; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
+; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @shl_v16i32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @shl_v16i32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @b32 to <16 x i32>*), align 4
+; AVX512-NEXT: [[TMP3:%.*]] = shl <16 x i32> [[TMP1]], [[TMP2]]
+; AVX512-NEXT: store <16 x i32> [[TMP3]], <16 x i32>* bitcast ([16 x i32]* @c32 to <16 x i32>*), align 4
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @shl_v16i32(
+; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4
+; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4
+; XOP-NEXT: ret void
+;
+ %a0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0 ), align 4
+ %a1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1 ), align 4
+ %a2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2 ), align 4
+ %a3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3 ), align 4
+ %a4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4 ), align 4
+ %a5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5 ), align 4
+ %a6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6 ), align 4
+ %a7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7 ), align 4
+ %a8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8 ), align 4
+ %a9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9 ), align 4
+ %a10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4
+ %a11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4
+ %a12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4
+ %a13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4
+ %a14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4
+ %a15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4
+ %b0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 0 ), align 4
+ %b1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 1 ), align 4
+ %b2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 2 ), align 4
+ %b3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 3 ), align 4
+ %b4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4 ), align 4
+ %b5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 5 ), align 4
+ %b6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 6 ), align 4
+ %b7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 7 ), align 4
+ %b8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8 ), align 4
+ %b9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 9 ), align 4
+ %b10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 10), align 4
+ %b11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 11), align 4
+ %b12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12), align 4
+ %b13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 13), align 4
+ %b14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 14), align 4
+ %b15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 15), align 4
+ %r0 = shl i32 %a0 , %b0
+ %r1 = shl i32 %a1 , %b1
+ %r2 = shl i32 %a2 , %b2
+ %r3 = shl i32 %a3 , %b3
+ %r4 = shl i32 %a4 , %b4
+ %r5 = shl i32 %a5 , %b5
+ %r6 = shl i32 %a6 , %b6
+ %r7 = shl i32 %a7 , %b7
+ %r8 = shl i32 %a8 , %b8
+ %r9 = shl i32 %a9 , %b9
+ %r10 = shl i32 %a10, %b10
+ %r11 = shl i32 %a11, %b11
+ %r12 = shl i32 %a12, %b12
+ %r13 = shl i32 %a13, %b13
+ %r14 = shl i32 %a14, %b14
+ %r15 = shl i32 %a15, %b15
+ store i32 %r0 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0 ), align 4
+ store i32 %r1 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1 ), align 4
+ store i32 %r2 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2 ), align 4
+ store i32 %r3 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3 ), align 4
+ store i32 %r4 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4 ), align 4
+ store i32 %r5 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5 ), align 4
+ store i32 %r6 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6 ), align 4
+ store i32 %r7 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7 ), align 4
+ store i32 %r8 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8 ), align 4
+ store i32 %r9 , i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9 ), align 4
+ store i32 %r10, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4
+ store i32 %r11, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4
+ store i32 %r12, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4
+ store i32 %r13, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4
+ store i32 %r14, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4
+ store i32 %r15, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @shl_v32i16() {
+; SSE-LABEL: @shl_v32i16(
+; SSE-NEXT: [[A0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0), align 2
+; SSE-NEXT: [[A1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1), align 2
+; SSE-NEXT: [[A2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2), align 2
+; SSE-NEXT: [[A3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3), align 2
+; SSE-NEXT: [[A4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4), align 2
+; SSE-NEXT: [[A5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5), align 2
+; SSE-NEXT: [[A6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6), align 2
+; SSE-NEXT: [[A7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7), align 2
+; SSE-NEXT: [[A8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8), align 2
+; SSE-NEXT: [[A9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9), align 2
+; SSE-NEXT: [[A10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+; SSE-NEXT: [[A11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+; SSE-NEXT: [[A12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+; SSE-NEXT: [[A13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+; SSE-NEXT: [[A14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+; SSE-NEXT: [[A15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+; SSE-NEXT: [[A16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+; SSE-NEXT: [[A17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+; SSE-NEXT: [[A18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+; SSE-NEXT: [[A19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+; SSE-NEXT: [[A20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+; SSE-NEXT: [[A21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+; SSE-NEXT: [[A22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+; SSE-NEXT: [[A23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+; SSE-NEXT: [[A24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+; SSE-NEXT: [[A25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+; SSE-NEXT: [[A26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+; SSE-NEXT: [[A27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+; SSE-NEXT: [[A28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+; SSE-NEXT: [[A29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+; SSE-NEXT: [[A30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+; SSE-NEXT: [[A31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+; SSE-NEXT: [[B0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0), align 2
+; SSE-NEXT: [[B1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1), align 2
+; SSE-NEXT: [[B2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2), align 2
+; SSE-NEXT: [[B3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3), align 2
+; SSE-NEXT: [[B4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4), align 2
+; SSE-NEXT: [[B5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5), align 2
+; SSE-NEXT: [[B6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6), align 2
+; SSE-NEXT: [[B7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7), align 2
+; SSE-NEXT: [[B8:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8), align 2
+; SSE-NEXT: [[B9:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9), align 2
+; SSE-NEXT: [[B10:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+; SSE-NEXT: [[B11:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+; SSE-NEXT: [[B12:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+; SSE-NEXT: [[B13:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+; SSE-NEXT: [[B14:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+; SSE-NEXT: [[B15:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+; SSE-NEXT: [[B16:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+; SSE-NEXT: [[B17:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+; SSE-NEXT: [[B18:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+; SSE-NEXT: [[B19:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+; SSE-NEXT: [[B20:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+; SSE-NEXT: [[B21:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+; SSE-NEXT: [[B22:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+; SSE-NEXT: [[B23:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+; SSE-NEXT: [[B24:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+; SSE-NEXT: [[B25:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+; SSE-NEXT: [[B26:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+; SSE-NEXT: [[B27:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+; SSE-NEXT: [[B28:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+; SSE-NEXT: [[B29:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+; SSE-NEXT: [[B30:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+; SSE-NEXT: [[B31:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+; SSE-NEXT: [[R0:%.*]] = shl i16 [[A0]], [[B0]]
+; SSE-NEXT: [[R1:%.*]] = shl i16 [[A1]], [[B1]]
+; SSE-NEXT: [[R2:%.*]] = shl i16 [[A2]], [[B2]]
+; SSE-NEXT: [[R3:%.*]] = shl i16 [[A3]], [[B3]]
+; SSE-NEXT: [[R4:%.*]] = shl i16 [[A4]], [[B4]]
+; SSE-NEXT: [[R5:%.*]] = shl i16 [[A5]], [[B5]]
+; SSE-NEXT: [[R6:%.*]] = shl i16 [[A6]], [[B6]]
+; SSE-NEXT: [[R7:%.*]] = shl i16 [[A7]], [[B7]]
+; SSE-NEXT: [[R8:%.*]] = shl i16 [[A8]], [[B8]]
+; SSE-NEXT: [[R9:%.*]] = shl i16 [[A9]], [[B9]]
+; SSE-NEXT: [[R10:%.*]] = shl i16 [[A10]], [[B10]]
+; SSE-NEXT: [[R11:%.*]] = shl i16 [[A11]], [[B11]]
+; SSE-NEXT: [[R12:%.*]] = shl i16 [[A12]], [[B12]]
+; SSE-NEXT: [[R13:%.*]] = shl i16 [[A13]], [[B13]]
+; SSE-NEXT: [[R14:%.*]] = shl i16 [[A14]], [[B14]]
+; SSE-NEXT: [[R15:%.*]] = shl i16 [[A15]], [[B15]]
+; SSE-NEXT: [[R16:%.*]] = shl i16 [[A16]], [[B16]]
+; SSE-NEXT: [[R17:%.*]] = shl i16 [[A17]], [[B17]]
+; SSE-NEXT: [[R18:%.*]] = shl i16 [[A18]], [[B18]]
+; SSE-NEXT: [[R19:%.*]] = shl i16 [[A19]], [[B19]]
+; SSE-NEXT: [[R20:%.*]] = shl i16 [[A20]], [[B20]]
+; SSE-NEXT: [[R21:%.*]] = shl i16 [[A21]], [[B21]]
+; SSE-NEXT: [[R22:%.*]] = shl i16 [[A22]], [[B22]]
+; SSE-NEXT: [[R23:%.*]] = shl i16 [[A23]], [[B23]]
+; SSE-NEXT: [[R24:%.*]] = shl i16 [[A24]], [[B24]]
+; SSE-NEXT: [[R25:%.*]] = shl i16 [[A25]], [[B25]]
+; SSE-NEXT: [[R26:%.*]] = shl i16 [[A26]], [[B26]]
+; SSE-NEXT: [[R27:%.*]] = shl i16 [[A27]], [[B27]]
+; SSE-NEXT: [[R28:%.*]] = shl i16 [[A28]], [[B28]]
+; SSE-NEXT: [[R29:%.*]] = shl i16 [[A29]], [[B29]]
+; SSE-NEXT: [[R30:%.*]] = shl i16 [[A30]], [[B30]]
+; SSE-NEXT: [[R31:%.*]] = shl i16 [[A31]], [[B31]]
+; SSE-NEXT: store i16 [[R0]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0), align 2
+; SSE-NEXT: store i16 [[R1]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1), align 2
+; SSE-NEXT: store i16 [[R2]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2), align 2
+; SSE-NEXT: store i16 [[R3]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3), align 2
+; SSE-NEXT: store i16 [[R4]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4), align 2
+; SSE-NEXT: store i16 [[R5]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5), align 2
+; SSE-NEXT: store i16 [[R6]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6), align 2
+; SSE-NEXT: store i16 [[R7]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7), align 2
+; SSE-NEXT: store i16 [[R8]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8), align 2
+; SSE-NEXT: store i16 [[R9]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9), align 2
+; SSE-NEXT: store i16 [[R10]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+; SSE-NEXT: store i16 [[R11]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+; SSE-NEXT: store i16 [[R12]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+; SSE-NEXT: store i16 [[R13]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+; SSE-NEXT: store i16 [[R14]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+; SSE-NEXT: store i16 [[R15]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+; SSE-NEXT: store i16 [[R16]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+; SSE-NEXT: store i16 [[R17]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+; SSE-NEXT: store i16 [[R18]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+; SSE-NEXT: store i16 [[R19]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+; SSE-NEXT: store i16 [[R20]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+; SSE-NEXT: store i16 [[R21]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+; SSE-NEXT: store i16 [[R22]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+; SSE-NEXT: store i16 [[R23]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+; SSE-NEXT: store i16 [[R24]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+; SSE-NEXT: store i16 [[R25]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+; SSE-NEXT: store i16 [[R26]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+; SSE-NEXT: store i16 [[R27]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+; SSE-NEXT: store i16 [[R28]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+; SSE-NEXT: store i16 [[R29]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+; SSE-NEXT: store i16 [[R30]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+; SSE-NEXT: store i16 [[R31]], i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @shl_v32i16(
+; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @shl_v32i16(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; AVX512-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; AVX512-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; AVX512-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; AVX512-NEXT: ret void
+;
+; XOP-LABEL: @shl_v32i16(
+; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]]
+; XOP-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]]
+; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2
+; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2
+; XOP-NEXT: ret void
+;
+ %a0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 0 ), align 2
+ %a1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 1 ), align 2
+ %a2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 2 ), align 2
+ %a3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 3 ), align 2
+ %a4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 4 ), align 2
+ %a5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 5 ), align 2
+ %a6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 6 ), align 2
+ %a7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 7 ), align 2
+ %a8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8 ), align 2
+ %a9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 9 ), align 2
+ %a10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 10), align 2
+ %a11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 11), align 2
+ %a12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 12), align 2
+ %a13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 13), align 2
+ %a14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 14), align 2
+ %a15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 15), align 2
+ %a16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16), align 2
+ %a17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 17), align 2
+ %a18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 18), align 2
+ %a19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 19), align 2
+ %a20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 20), align 2
+ %a21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 21), align 2
+ %a22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 22), align 2
+ %a23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 23), align 2
+ %a24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24), align 2
+ %a25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 25), align 2
+ %a26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 26), align 2
+ %a27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 27), align 2
+ %a28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 28), align 2
+ %a29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 29), align 2
+ %a30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 30), align 2
+ %a31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 31), align 2
+ %b0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 0 ), align 2
+ %b1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 1 ), align 2
+ %b2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 2 ), align 2
+ %b3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 3 ), align 2
+ %b4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 4 ), align 2
+ %b5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 5 ), align 2
+ %b6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 6 ), align 2
+ %b7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 7 ), align 2
+ %b8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8 ), align 2
+ %b9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 9 ), align 2
+ %b10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 10), align 2
+ %b11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 11), align 2
+ %b12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 12), align 2
+ %b13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 13), align 2
+ %b14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 14), align 2
+ %b15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 15), align 2
+ %b16 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16), align 2
+ %b17 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 17), align 2
+ %b18 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 18), align 2
+ %b19 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 19), align 2
+ %b20 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 20), align 2
+ %b21 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 21), align 2
+ %b22 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 22), align 2
+ %b23 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 23), align 2
+ %b24 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24), align 2
+ %b25 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 25), align 2
+ %b26 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 26), align 2
+ %b27 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 27), align 2
+ %b28 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 28), align 2
+ %b29 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 29), align 2
+ %b30 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 30), align 2
+ %b31 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 31), align 2
+ %r0 = shl i16 %a0 , %b0
+ %r1 = shl i16 %a1 , %b1
+ %r2 = shl i16 %a2 , %b2
+ %r3 = shl i16 %a3 , %b3
+ %r4 = shl i16 %a4 , %b4
+ %r5 = shl i16 %a5 , %b5
+ %r6 = shl i16 %a6 , %b6
+ %r7 = shl i16 %a7 , %b7
+ %r8 = shl i16 %a8 , %b8
+ %r9 = shl i16 %a9 , %b9
+ %r10 = shl i16 %a10, %b10
+ %r11 = shl i16 %a11, %b11
+ %r12 = shl i16 %a12, %b12
+ %r13 = shl i16 %a13, %b13
+ %r14 = shl i16 %a14, %b14
+ %r15 = shl i16 %a15, %b15
+ %r16 = shl i16 %a16, %b16
+ %r17 = shl i16 %a17, %b17
+ %r18 = shl i16 %a18, %b18
+ %r19 = shl i16 %a19, %b19
+ %r20 = shl i16 %a20, %b20
+ %r21 = shl i16 %a21, %b21
+ %r22 = shl i16 %a22, %b22
+ %r23 = shl i16 %a23, %b23
+ %r24 = shl i16 %a24, %b24
+ %r25 = shl i16 %a25, %b25
+ %r26 = shl i16 %a26, %b26
+ %r27 = shl i16 %a27, %b27
+ %r28 = shl i16 %a28, %b28
+ %r29 = shl i16 %a29, %b29
+ %r30 = shl i16 %a30, %b30
+ %r31 = shl i16 %a31, %b31
+ store i16 %r0 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 0 ), align 2
+ store i16 %r1 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 1 ), align 2
+ store i16 %r2 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 2 ), align 2
+ store i16 %r3 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 3 ), align 2
+ store i16 %r4 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 4 ), align 2
+ store i16 %r5 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 5 ), align 2
+ store i16 %r6 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 6 ), align 2
+ store i16 %r7 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 7 ), align 2
+ store i16 %r8 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8 ), align 2
+ store i16 %r9 , i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 9 ), align 2
+ store i16 %r10, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 10), align 2
+ store i16 %r11, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 11), align 2
+ store i16 %r12, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 12), align 2
+ store i16 %r13, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 13), align 2
+ store i16 %r14, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 14), align 2
+ store i16 %r15, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 15), align 2
+ store i16 %r16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16), align 2
+ store i16 %r17, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 17), align 2
+ store i16 %r18, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 18), align 2
+ store i16 %r19, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 19), align 2
+ store i16 %r20, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 20), align 2
+ store i16 %r21, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 21), align 2
+ store i16 %r22, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 22), align 2
+ store i16 %r23, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 23), align 2
+ store i16 %r24, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24), align 2
+ store i16 %r25, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 25), align 2
+ store i16 %r26, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 26), align 2
+ store i16 %r27, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 27), align 2
+ store i16 %r28, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 28), align 2
+ store i16 %r29, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 29), align 2
+ store i16 %r30, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 30), align 2
+ store i16 %r31, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 31), align 2
+ ret void
+}
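+; Note (shl_v32i16 above): with variable per-lane shift amounts, the SSE run
+; keeps the whole sequence scalar, while the AVX, AVX512 and XOP runs
+; vectorize it into two <16 x i16> shl operations -- presumably the SSE cost
+; model deems a variable 16-bit vector shift too expensive on that subtarget.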
+
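+; shl_v64i8 below uses a single common CHECK block: every run configuration
+; splits the 64 x i8 shift into four <16 x i8> shl operations, suggesting the
+; cost model treats variable i8 vector shifts uniformly across these
+; subtargets.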
+define void @shl_v64i8() {
+; CHECK-LABEL: @shl_v64i8(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]]
+; CHECK-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1
+; CHECK-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1
+; CHECK-NEXT: ret void
+;
+ %a0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 0 ), align 1
+ %a1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 1 ), align 1
+ %a2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 2 ), align 1
+ %a3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 3 ), align 1
+ %a4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 4 ), align 1
+ %a5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 5 ), align 1
+ %a6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 6 ), align 1
+ %a7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 7 ), align 1
+ %a8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 8 ), align 1
+ %a9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 9 ), align 1
+ %a10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 10), align 1
+ %a11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 11), align 1
+ %a12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 12), align 1
+ %a13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 13), align 1
+ %a14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 14), align 1
+ %a15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 15), align 1
+ %a16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16), align 1
+ %a17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 17), align 1
+ %a18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 18), align 1
+ %a19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 19), align 1
+ %a20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 20), align 1
+ %a21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 21), align 1
+ %a22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 22), align 1
+ %a23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 23), align 1
+ %a24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 24), align 1
+ %a25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 25), align 1
+ %a26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 26), align 1
+ %a27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 27), align 1
+ %a28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 28), align 1
+ %a29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 29), align 1
+ %a30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 30), align 1
+ %a31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 31), align 1
+ %a32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32), align 1
+ %a33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 33), align 1
+ %a34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 34), align 1
+ %a35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 35), align 1
+ %a36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 36), align 1
+ %a37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 37), align 1
+ %a38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 38), align 1
+ %a39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 39), align 1
+ %a40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 40), align 1
+ %a41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 41), align 1
+ %a42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 42), align 1
+ %a43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 43), align 1
+ %a44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 44), align 1
+ %a45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 45), align 1
+ %a46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 46), align 1
+ %a47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 47), align 1
+ %a48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48), align 1
+ %a49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 49), align 1
+ %a50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 50), align 1
+ %a51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 51), align 1
+ %a52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 52), align 1
+ %a53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 53), align 1
+ %a54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 54), align 1
+ %a55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 55), align 1
+ %a56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 56), align 1
+ %a57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 57), align 1
+ %a58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 58), align 1
+ %a59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 59), align 1
+ %a60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 60), align 1
+ %a61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 61), align 1
+ %a62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 62), align 1
+ %a63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 63), align 1
+ %b0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 0 ), align 1
+ %b1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 1 ), align 1
+ %b2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 2 ), align 1
+ %b3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 3 ), align 1
+ %b4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 4 ), align 1
+ %b5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 5 ), align 1
+ %b6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 6 ), align 1
+ %b7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 7 ), align 1
+ %b8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 8 ), align 1
+ %b9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 9 ), align 1
+ %b10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 10), align 1
+ %b11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 11), align 1
+ %b12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 12), align 1
+ %b13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 13), align 1
+ %b14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 14), align 1
+ %b15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 15), align 1
+ %b16 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16), align 1
+ %b17 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 17), align 1
+ %b18 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 18), align 1
+ %b19 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 19), align 1
+ %b20 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 20), align 1
+ %b21 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 21), align 1
+ %b22 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 22), align 1
+ %b23 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 23), align 1
+ %b24 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 24), align 1
+ %b25 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 25), align 1
+ %b26 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 26), align 1
+ %b27 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 27), align 1
+ %b28 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 28), align 1
+ %b29 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 29), align 1
+ %b30 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 30), align 1
+ %b31 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 31), align 1
+ %b32 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32), align 1
+ %b33 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 33), align 1
+ %b34 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 34), align 1
+ %b35 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 35), align 1
+ %b36 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 36), align 1
+ %b37 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 37), align 1
+ %b38 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 38), align 1
+ %b39 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 39), align 1
+ %b40 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 40), align 1
+ %b41 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 41), align 1
+ %b42 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 42), align 1
+ %b43 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 43), align 1
+ %b44 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 44), align 1
+ %b45 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 45), align 1
+ %b46 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 46), align 1
+ %b47 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 47), align 1
+ %b48 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48), align 1
+ %b49 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 49), align 1
+ %b50 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 50), align 1
+ %b51 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 51), align 1
+ %b52 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 52), align 1
+ %b53 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 53), align 1
+ %b54 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 54), align 1
+ %b55 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 55), align 1
+ %b56 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 56), align 1
+ %b57 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 57), align 1
+ %b58 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 58), align 1
+ %b59 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 59), align 1
+ %b60 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 60), align 1
+ %b61 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 61), align 1
+ %b62 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 62), align 1
+ %b63 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 63), align 1
+ %r0 = shl i8 %a0 , %b0
+ %r1 = shl i8 %a1 , %b1
+ %r2 = shl i8 %a2 , %b2
+ %r3 = shl i8 %a3 , %b3
+ %r4 = shl i8 %a4 , %b4
+ %r5 = shl i8 %a5 , %b5
+ %r6 = shl i8 %a6 , %b6
+ %r7 = shl i8 %a7 , %b7
+ %r8 = shl i8 %a8 , %b8
+ %r9 = shl i8 %a9 , %b9
+ %r10 = shl i8 %a10, %b10
+ %r11 = shl i8 %a11, %b11
+ %r12 = shl i8 %a12, %b12
+ %r13 = shl i8 %a13, %b13
+ %r14 = shl i8 %a14, %b14
+ %r15 = shl i8 %a15, %b15
+ %r16 = shl i8 %a16, %b16
+ %r17 = shl i8 %a17, %b17
+ %r18 = shl i8 %a18, %b18
+ %r19 = shl i8 %a19, %b19
+ %r20 = shl i8 %a20, %b20
+ %r21 = shl i8 %a21, %b21
+ %r22 = shl i8 %a22, %b22
+ %r23 = shl i8 %a23, %b23
+ %r24 = shl i8 %a24, %b24
+ %r25 = shl i8 %a25, %b25
+ %r26 = shl i8 %a26, %b26
+ %r27 = shl i8 %a27, %b27
+ %r28 = shl i8 %a28, %b28
+ %r29 = shl i8 %a29, %b29
+ %r30 = shl i8 %a30, %b30
+ %r31 = shl i8 %a31, %b31
+ %r32 = shl i8 %a32, %b32
+ %r33 = shl i8 %a33, %b33
+ %r34 = shl i8 %a34, %b34
+ %r35 = shl i8 %a35, %b35
+ %r36 = shl i8 %a36, %b36
+ %r37 = shl i8 %a37, %b37
+ %r38 = shl i8 %a38, %b38
+ %r39 = shl i8 %a39, %b39
+ %r40 = shl i8 %a40, %b40
+ %r41 = shl i8 %a41, %b41
+ %r42 = shl i8 %a42, %b42
+ %r43 = shl i8 %a43, %b43
+ %r44 = shl i8 %a44, %b44
+ %r45 = shl i8 %a45, %b45
+ %r46 = shl i8 %a46, %b46
+ %r47 = shl i8 %a47, %b47
+ %r48 = shl i8 %a48, %b48
+ %r49 = shl i8 %a49, %b49
+ %r50 = shl i8 %a50, %b50
+ %r51 = shl i8 %a51, %b51
+ %r52 = shl i8 %a52, %b52
+ %r53 = shl i8 %a53, %b53
+ %r54 = shl i8 %a54, %b54
+ %r55 = shl i8 %a55, %b55
+ %r56 = shl i8 %a56, %b56
+ %r57 = shl i8 %a57, %b57
+ %r58 = shl i8 %a58, %b58
+ %r59 = shl i8 %a59, %b59
+ %r60 = shl i8 %a60, %b60
+ %r61 = shl i8 %a61, %b61
+ %r62 = shl i8 %a62, %b62
+ %r63 = shl i8 %a63, %b63
+ store i8 %r0 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 0 ), align 1
+ store i8 %r1 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 1 ), align 1
+ store i8 %r2 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 2 ), align 1
+ store i8 %r3 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 3 ), align 1
+ store i8 %r4 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 4 ), align 1
+ store i8 %r5 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 5 ), align 1
+ store i8 %r6 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 6 ), align 1
+ store i8 %r7 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 7 ), align 1
+ store i8 %r8 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 8 ), align 1
+ store i8 %r9 , i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 9 ), align 1
+ store i8 %r10, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 10), align 1
+ store i8 %r11, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 11), align 1
+ store i8 %r12, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 12), align 1
+ store i8 %r13, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 13), align 1
+ store i8 %r14, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 14), align 1
+ store i8 %r15, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 15), align 1
+ store i8 %r16, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16), align 1
+ store i8 %r17, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 17), align 1
+ store i8 %r18, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 18), align 1
+ store i8 %r19, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 19), align 1
+ store i8 %r20, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 20), align 1
+ store i8 %r21, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 21), align 1
+ store i8 %r22, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 22), align 1
+ store i8 %r23, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 23), align 1
+ store i8 %r24, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 24), align 1
+ store i8 %r25, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 25), align 1
+ store i8 %r26, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 26), align 1
+ store i8 %r27, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 27), align 1
+ store i8 %r28, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 28), align 1
+ store i8 %r29, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 29), align 1
+ store i8 %r30, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 30), align 1
+ store i8 %r31, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 31), align 1
+ store i8 %r32, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32), align 1
+ store i8 %r33, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 33), align 1
+ store i8 %r34, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 34), align 1
+ store i8 %r35, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 35), align 1
+ store i8 %r36, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 36), align 1
+ store i8 %r37, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 37), align 1
+ store i8 %r38, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 38), align 1
+ store i8 %r39, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 39), align 1
+ store i8 %r40, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 40), align 1
+ store i8 %r41, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 41), align 1
+ store i8 %r42, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 42), align 1
+ store i8 %r43, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 43), align 1
+ store i8 %r44, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 44), align 1
+ store i8 %r45, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 45), align 1
+ store i8 %r46, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 46), align 1
+ store i8 %r47, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 47), align 1
+ store i8 %r48, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48), align 1
+ store i8 %r49, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 49), align 1
+ store i8 %r50, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 50), align 1
+ store i8 %r51, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 51), align 1
+ store i8 %r52, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 52), align 1
+ store i8 %r53, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 53), align 1
+ store i8 %r54, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 54), align 1
+ store i8 %r55, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 55), align 1
+ store i8 %r56, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 56), align 1
+ store i8 %r57, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 57), align 1
+ store i8 %r58, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 58), align 1
+ store i8 %r59, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 59), align 1
+ store i8 %r60, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 60), align 1
+ store i8 %r61, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 61), align 1
+ store i8 %r62, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 62), align 1
+ store i8 %r63, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 63), align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
new file mode 100644
index 00000000000..360ce6ab076
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sign-extend.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
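+; Both tests check that a per-lane extract/convert/insert chain over all four
+; elements is recognized by the SLP vectorizer and rewritten as a single
+; vector sext (resp. trunc); the residual extract/insert sequence is left in
+; place for later passes to clean up.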
+define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) {
+; CHECK-LABEL: @sign_extend_v_v(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[VECINIT9]]
+;
+entry:
+ %vecext = extractelement <4 x i16> %lhs, i32 0
+ %conv = sext i16 %vecext to i32
+ %vecinit = insertelement <4 x i32> undef, i32 %conv, i32 0
+ %vecext1 = extractelement <4 x i16> %lhs, i32 1
+ %conv2 = sext i16 %vecext1 to i32
+ %vecinit3 = insertelement <4 x i32> %vecinit, i32 %conv2, i32 1
+ %vecext4 = extractelement <4 x i16> %lhs, i32 2
+ %conv5 = sext i16 %vecext4 to i32
+ %vecinit6 = insertelement <4 x i32> %vecinit3, i32 %conv5, i32 2
+ %vecext7 = extractelement <4 x i16> %lhs, i32 3
+ %conv8 = sext i16 %vecext7 to i32
+ %vecinit9 = insertelement <4 x i32> %vecinit6, i32 %conv8, i32 3
+ ret <4 x i32> %vecinit9
+}
+
+define <4 x i16> @truncate_v_v(<4 x i32> %lhs) {
+; CHECK-LABEL: @truncate_v_v(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[LHS:%.*]] to <4 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0
+; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i16> undef, i16 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
+; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i16> [[VECINIT]], i16 [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
+; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i16> [[VECINIT3]], i16 [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3
+; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i16> [[VECINIT6]], i16 [[TMP4]], i32 3
+; CHECK-NEXT: ret <4 x i16> [[VECINIT9]]
+;
+entry:
+ %vecext = extractelement <4 x i32> %lhs, i32 0
+ %conv = trunc i32 %vecext to i16
+ %vecinit = insertelement <4 x i16> undef, i16 %conv, i32 0
+ %vecext1 = extractelement <4 x i32> %lhs, i32 1
+ %conv2 = trunc i32 %vecext1 to i16
+ %vecinit3 = insertelement <4 x i16> %vecinit, i16 %conv2, i32 1
+ %vecext4 = extractelement <4 x i32> %lhs, i32 2
+ %conv5 = trunc i32 %vecext4 to i16
+ %vecinit6 = insertelement <4 x i16> %vecinit3, i16 %conv5, i32 2
+ %vecext7 = extractelement <4 x i32> %lhs, i32 3
+ %conv8 = trunc i32 %vecext7 to i16
+ %vecinit9 = insertelement <4 x i16> %vecinit6, i16 %conv8, i32 3
+ ret <4 x i16> %vecinit9
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
new file mode 100644
index 00000000000..59b94cad17e
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll
@@ -0,0 +1,152 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
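+; @rollable: the four consecutive scalar loads, mul/add pairs and stores per
+; iteration collapse into <4 x i32> operations. @unrollable is identical
+; except for the call to @goo between the stores, which acts as a memory
+; barrier and blocks vectorization.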
+define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i64 %n) {
+; CHECK-LABEL: @rollable(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[TMP10:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[I_019]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP5]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], <i32 7, i32 14, i32 21, i32 28>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP10]] = add i64 [[I_019]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP10]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: ret i32 undef
+;
+ %1 = icmp eq i64 %n, 0
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
+ %2 = shl i64 %i.019, 2
+ %3 = getelementptr inbounds i32, i32* %in, i64 %2
+ %4 = load i32, i32* %3, align 4
+ %5 = or i64 %2, 1
+ %6 = getelementptr inbounds i32, i32* %in, i64 %5
+ %7 = load i32, i32* %6, align 4
+ %8 = or i64 %2, 2
+ %9 = getelementptr inbounds i32, i32* %in, i64 %8
+ %10 = load i32, i32* %9, align 4
+ %11 = or i64 %2, 3
+ %12 = getelementptr inbounds i32, i32* %in, i64 %11
+ %13 = load i32, i32* %12, align 4
+ %14 = mul i32 %4, 7
+ %15 = add i32 %14, 7
+ %16 = mul i32 %7, 7
+ %17 = add i32 %16, 14
+ %18 = mul i32 %10, 7
+ %19 = add i32 %18, 21
+ %20 = mul i32 %13, 7
+ %21 = add i32 %20, 28
+ %22 = getelementptr inbounds i32, i32* %out, i64 %2
+ store i32 %15, i32* %22, align 4
+ %23 = getelementptr inbounds i32, i32* %out, i64 %5
+ store i32 %17, i32* %23, align 4
+ %24 = getelementptr inbounds i32, i32* %out, i64 %8
+ store i32 %19, i32* %24, align 4
+ %25 = getelementptr inbounds i32, i32* %out, i64 %11
+ store i32 %21, i32* %25, align 4
+ %26 = add i64 %i.019, 1
+ %exitcond = icmp eq i64 %26, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+define i32 @unrollable(i32* %in, i32* %out, i64 %n) nounwind ssp uwtable {
+; CHECK-LABEL: @unrollable(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N:%.*]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[TMP26:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[I_019]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP4]], 7
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 7
+; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP7]], 7
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 14
+; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP10]], 7
+; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 21
+; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP13]], 7
+; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 28
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
+; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP5]]
+; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP23]], align 4
+; CHECK-NEXT: [[BARRIER:%.*]] = call i32 @goo(i32 0)
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP8]]
+; CHECK-NEXT: store i32 [[TMP19]], i32* [[TMP24]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP11]]
+; CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP25]], align 4
+; CHECK-NEXT: [[TMP26]] = add i64 [[I_019]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP26]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]]
+; CHECK: ._crit_edge:
+; CHECK-NEXT: ret i32 undef
+;
+ %1 = icmp eq i64 %n, 0
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %i.019 = phi i64 [ %26, %.lr.ph ], [ 0, %0 ]
+ %2 = shl i64 %i.019, 2
+ %3 = getelementptr inbounds i32, i32* %in, i64 %2
+ %4 = load i32, i32* %3, align 4
+ %5 = or i64 %2, 1
+ %6 = getelementptr inbounds i32, i32* %in, i64 %5
+ %7 = load i32, i32* %6, align 4
+ %8 = or i64 %2, 2
+ %9 = getelementptr inbounds i32, i32* %in, i64 %8
+ %10 = load i32, i32* %9, align 4
+ %11 = or i64 %2, 3
+ %12 = getelementptr inbounds i32, i32* %in, i64 %11
+ %13 = load i32, i32* %12, align 4
+ %14 = mul i32 %4, 7
+ %15 = add i32 %14, 7
+ %16 = mul i32 %7, 7
+ %17 = add i32 %16, 14
+ %18 = mul i32 %10, 7
+ %19 = add i32 %18, 21
+ %20 = mul i32 %13, 7
+ %21 = add i32 %20, 28
+ %22 = getelementptr inbounds i32, i32* %out, i64 %2
+ store i32 %15, i32* %22, align 4
+ %23 = getelementptr inbounds i32, i32* %out, i64 %5
+ store i32 %17, i32* %23, align 4
+  %barrier = call i32 @goo(i32 0) ; <---------------- memory barrier: the call may touch memory, so stores cannot be reordered across it.
+ %24 = getelementptr inbounds i32, i32* %out, i64 %8
+ store i32 %19, i32* %24, align 4
+ %25 = getelementptr inbounds i32, i32* %out, i64 %11
+ store i32 %21, i32* %25, align 4
+ %26 = add i64 %i.019, 1
+ %exitcond = icmp eq i64 %26, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+declare i32 @goo(i32)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
new file mode 100644
index 00000000000..4ddc9cbe6fb
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll
@@ -0,0 +1,121 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
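+; Two-lane fmul chains over adjacent doubles should vectorize into
+; <2 x double> operations, even when the pointers are obfuscated with
+; bitcasts; the volatile variants below must stay scalar.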
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: ret void
+;
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ ret void
+}
+
+; Simple 3-pair chain with loads and stores, obfuscated with bitcasts
+define void @test2(double* %a, double* %b, i8* %e) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double*
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
+; CHECK-NEXT: ret void
+;
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ %c = bitcast i8* %e to double*
+ store double %mul, double* %c, align 8
+ %carrayidx5 = getelementptr inbounds i8, i8* %e, i64 8
+ %arrayidx5 = bitcast i8* %carrayidx5 to double*
+ store double %mul5, double* %arrayidx5, align 8
+ ret void
+}
+
+; Don't vectorize volatile loads.
+define void @test_volatile_load(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test_volatile_load(
+; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8
+; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT: ret void
+;
+ %i0 = load volatile double, double* %a, align 8
+ %i1 = load volatile double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ ret void
+}
+
+; Don't vectorize volatile stores.
+define void @test_volatile_store(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test_volatile_store(
+; CHECK-NEXT: [[I0:%.*]] = load double, double* [[A:%.*]], align 8
+; CHECK-NEXT: [[I1:%.*]] = load double, double* [[B:%.*]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT: store volatile double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT: store volatile double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT: ret void
+;
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store volatile double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store volatile double %mul5, double* %arrayidx5, align 8
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
new file mode 100644
index 00000000000..d4c4be5bbf6
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll
@@ -0,0 +1,1262 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
+
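+; The i64 -> fp tests below should only vectorize when AVX512DQ is
+; available (VCVTQQ2PD/VCVTQQ2PS); the AVX256NODQ prefixes cover AVX/AVX2
+; targets without it, and AVX256DQ covers skylake-avx512 with 256-bit
+; vectors preferred.
+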
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x i64] zeroinitializer, align 64
+@src32 = common global [16 x i32] zeroinitializer, align 64
+@src16 = common global [32 x i16] zeroinitializer, align 64
+@src8 = common global [64 x i8] zeroinitializer, align 64
+
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+;
+; SITOFP to vXf64
+;
+
+define void @sitofp_2i64_2f64() #0 {
+; SSE-LABEL: @sitofp_2i64_2f64(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @sitofp_2i64_2f64(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_2i64_2f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @sitofp_2i64_2f64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %cvt0 = sitofp i64 %ld0 to double
+ %cvt1 = sitofp i64 %ld1 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @sitofp_4i64_4f64() #0 {
+; SSE-LABEL: @sitofp_4i64_4f64(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @sitofp_4i64_4f64(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_4i64_4f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX512-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @sitofp_4i64_4f64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256DQ-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+ %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+ %cvt0 = sitofp i64 %ld0 to double
+ %cvt1 = sitofp i64 %ld1 to double
+ %cvt2 = sitofp i64 %ld2 to double
+ %cvt3 = sitofp i64 %ld3 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @sitofp_8i64_8f64() #0 {
+; SSE-LABEL: @sitofp_8i64_8f64(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; SSE-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; SSE-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @sitofp_8i64_8f64(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to double
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to double
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to double
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to double
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to double
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to double
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to double
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to double
+; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_8i64_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @sitofp_8i64_8f64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+ %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+ %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+ %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+ %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+ %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+ %cvt0 = sitofp i64 %ld0 to double
+ %cvt1 = sitofp i64 %ld1 to double
+ %cvt2 = sitofp i64 %ld2 to double
+ %cvt3 = sitofp i64 %ld3 to double
+ %cvt4 = sitofp i64 %ld4 to double
+ %cvt5 = sitofp i64 %ld5 to double
+ %cvt6 = sitofp i64 %ld6 to double
+ %cvt7 = sitofp i64 %ld7 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+ store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+ store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @sitofp_2i32_2f64() #0 {
+; CHECK-LABEL: @sitofp_2i32_2f64(
+; CHECK-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; CHECK-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %cvt0 = sitofp i32 %ld0 to double
+ %cvt1 = sitofp i32 %ld1 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @sitofp_4i32_4f64() #0 {
+; SSE-LABEL: @sitofp_4i32_4f64(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[LD3]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sitofp_4i32_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+ %cvt0 = sitofp i32 %ld0 to double
+ %cvt1 = sitofp i32 %ld1 to double
+ %cvt2 = sitofp i32 %ld2 to double
+ %cvt3 = sitofp i32 %ld3 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @sitofp_8i32_8f64() #0 {
+; SSE-LABEL: @sitofp_8i32_8f64(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i32 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i32 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i32 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i32 [[LD3]] to double
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i32 [[LD4]] to double
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i32 [[LD5]] to double
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i32 [[LD6]] to double
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i32 [[LD7]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @sitofp_8i32_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double>
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_8i32_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+ %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+ %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+ %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+ %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+ %cvt0 = sitofp i32 %ld0 to double
+ %cvt1 = sitofp i32 %ld1 to double
+ %cvt2 = sitofp i32 %ld2 to double
+ %cvt3 = sitofp i32 %ld3 to double
+ %cvt4 = sitofp i32 %ld4 to double
+ %cvt5 = sitofp i32 %ld5 to double
+ %cvt6 = sitofp i32 %ld6 to double
+ %cvt7 = sitofp i32 %ld7 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+ store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+ store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @sitofp_2i16_2f64() #0 {
+; CHECK-LABEL: @sitofp_2i16_2f64(
+; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double
+; CHECK-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double
+; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %cvt0 = sitofp i16 %ld0 to double
+ %cvt1 = sitofp i16 %ld1 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @sitofp_4i16_4f64() #0 {
+; SSE-LABEL: @sitofp_4i16_4f64(
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sitofp_4i16_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+ %cvt0 = sitofp i16 %ld0 to double
+ %cvt1 = sitofp i16 %ld1 to double
+ %cvt2 = sitofp i16 %ld2 to double
+ %cvt3 = sitofp i16 %ld3 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @sitofp_8i16_8f64() #0 {
+; SSE-LABEL: @sitofp_8i16_8f64(
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i16 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i16 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i16 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i16 [[LD3]] to double
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i16 [[LD4]] to double
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i16 [[LD5]] to double
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i16 [[LD6]] to double
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i16 [[LD7]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @sitofp_8i16_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double>
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_8i16_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+ %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+ %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+ %cvt0 = sitofp i16 %ld0 to double
+ %cvt1 = sitofp i16 %ld1 to double
+ %cvt2 = sitofp i16 %ld2 to double
+ %cvt3 = sitofp i16 %ld3 to double
+ %cvt4 = sitofp i16 %ld4 to double
+ %cvt5 = sitofp i16 %ld5 to double
+ %cvt6 = sitofp i16 %ld6 to double
+ %cvt7 = sitofp i16 %ld7 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+ store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+ store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @sitofp_2i8_2f64() #0 {
+; CHECK-LABEL: @sitofp_2i8_2f64(
+; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; CHECK-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double
+; CHECK-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double
+; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %cvt0 = sitofp i8 %ld0 to double
+ %cvt1 = sitofp i8 %ld1 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @sitofp_4i8_4f64() #0 {
+; SSE-LABEL: @sitofp_4i8_4f64(
+; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sitofp_4i8_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+ %cvt0 = sitofp i8 %ld0 to double
+ %cvt1 = sitofp i8 %ld1 to double
+ %cvt2 = sitofp i8 %ld2 to double
+ %cvt3 = sitofp i8 %ld3 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @sitofp_8i8_8f64() #0 {
+; SSE-LABEL: @sitofp_8i8_8f64(
+; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i8 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i8 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i8 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i8 [[LD3]] to double
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i8 [[LD4]] to double
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i8 [[LD5]] to double
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i8 [[LD6]] to double
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i8 [[LD7]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @sitofp_8i8_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double>
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_8i8_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+ %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+ %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+ %cvt0 = sitofp i8 %ld0 to double
+ %cvt1 = sitofp i8 %ld1 to double
+ %cvt2 = sitofp i8 %ld2 to double
+ %cvt3 = sitofp i8 %ld3 to double
+ %cvt4 = sitofp i8 %ld4 to double
+ %cvt5 = sitofp i8 %ld5 to double
+ %cvt6 = sitofp i8 %ld6 to double
+ %cvt7 = sitofp i8 %ld7 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+ store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+ store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
+;
+; SITOFP to vXf32
+;
+
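+; A 2 x i64 -> 2 x float conversion is expected to stay scalar on every
+; target here, even with DQ instructions, presumably because a <2 x float>
+; result is too narrow to be profitable.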
+define void @sitofp_2i64_2f32() #0 {
+; CHECK-LABEL: @sitofp_2i64_2f32(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; CHECK-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %cvt0 = sitofp i64 %ld0 to float
+ %cvt1 = sitofp i64 %ld1 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ ret void
+}
+
+define void @sitofp_4i64_4f32() #0 {
+; SSE-LABEL: @sitofp_4i64_4f32(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @sitofp_4i64_4f32(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_4i64_4f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @sitofp_4i64_4f32(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+ %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+ %cvt0 = sitofp i64 %ld0 to float
+ %cvt1 = sitofp i64 %ld1 to float
+ %cvt2 = sitofp i64 %ld2 to float
+ %cvt3 = sitofp i64 %ld3 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @sitofp_8i64_8f32() #0 {
+; SSE-LABEL: @sitofp_8i64_8f32(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; SSE-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; SSE-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @sitofp_8i64_8f32(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = sitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = sitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = sitofp i64 [[LD4]] to float
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = sitofp i64 [[LD5]] to float
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = sitofp i64 [[LD6]] to float
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = sitofp i64 [[LD7]] to float
+; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_8i64_8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @sitofp_8i64_8f32(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+ %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+ %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+ %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+ %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+ %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+ %cvt0 = sitofp i64 %ld0 to float
+ %cvt1 = sitofp i64 %ld1 to float
+ %cvt2 = sitofp i64 %ld2 to float
+ %cvt3 = sitofp i64 %ld3 to float
+ %cvt4 = sitofp i64 %ld4 to float
+ %cvt5 = sitofp i64 %ld5 to float
+ %cvt6 = sitofp i64 %ld6 to float
+ %cvt7 = sitofp i64 %ld7 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+ store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+ store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
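+
+; Note: x86 has no packed i64 -> float conversion below AVX-512DQ (where
+; vcvtqq2ps was introduced), which is why only the AVX512 and AVX256DQ runs
+; above vectorize this function while SSE and AVX256NODQ stay scalar.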
+
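+; i32 -> float converts directly on every tested subtarget (cvtdq2ps and
+; friends), so one shared CHECK prefix covers all runs: the four scalar
+; conversions collapse into a single <4 x i32> -> <4 x float> sequence.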
+define void @sitofp_4i32_4f32() #0 {
+; CHECK-LABEL: @sitofp_4i32_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+ %cvt0 = sitofp i32 %ld0 to float
+ %cvt1 = sitofp i32 %ld1 to float
+ %cvt2 = sitofp i32 %ld2 to float
+ %cvt3 = sitofp i32 %ld3 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
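+; The split follows vector register width: SSE processes 8 x i32 as two
+; <4 x i32> halves, while the AVX runs use a single <8 x i32> conversion.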
+define void @sitofp_8i32_8f32() #0 {
+; SSE-LABEL: @sitofp_8i32_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sitofp_8i32_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+ %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+ %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+ %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+ %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+ %cvt0 = sitofp i32 %ld0 to float
+ %cvt1 = sitofp i32 %ld1 to float
+ %cvt2 = sitofp i32 %ld2 to float
+ %cvt3 = sitofp i32 %ld3 to float
+ %cvt4 = sitofp i32 %ld4 to float
+ %cvt5 = sitofp i32 %ld5 to float
+ %cvt6 = sitofp i32 %ld6 to float
+ %cvt7 = sitofp i32 %ld7 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+ store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+ store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
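+; 16 x i32 scales the same way: four <4 x i32> conversions under SSE, two
+; <8 x i32> under AVX256, and one <16 x i32> -> <16 x float> under AVX512.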
+define void @sitofp_16i32_16f32() #0 {
+; SSE-LABEL: @sitofp_16i32_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @sitofp_16i32_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
+; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float>
+; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_16i32_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i32> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4
+ %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16
+ %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4
+ %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8
+ %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7 ), align 4
+ %ld8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8 ), align 32
+ %ld9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 9 ), align 4
+ %ld10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 10), align 8
+ %ld11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 11), align 4
+ %ld12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12), align 16
+ %ld13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 13), align 4
+ %ld14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 14), align 8
+ %ld15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 15), align 4
+ %cvt0 = sitofp i32 %ld0 to float
+ %cvt1 = sitofp i32 %ld1 to float
+ %cvt2 = sitofp i32 %ld2 to float
+ %cvt3 = sitofp i32 %ld3 to float
+ %cvt4 = sitofp i32 %ld4 to float
+ %cvt5 = sitofp i32 %ld5 to float
+ %cvt6 = sitofp i32 %ld6 to float
+ %cvt7 = sitofp i32 %ld7 to float
+ %cvt8 = sitofp i32 %ld8 to float
+ %cvt9 = sitofp i32 %ld9 to float
+ %cvt10 = sitofp i32 %ld10 to float
+ %cvt11 = sitofp i32 %ld11 to float
+ %cvt12 = sitofp i32 %ld12 to float
+ %cvt13 = sitofp i32 %ld13 to float
+ %cvt14 = sitofp i32 %ld14 to float
+ %cvt15 = sitofp i32 %ld15 to float
+ store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+ store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+ store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+ store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+ store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+ store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+ store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+ store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+ store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
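+; Narrower sources still vectorize: the scalar i16 loads are grouped into a
+; <4 x i16> load and the sign extension is implied by the vector sitofp.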
+define void @sitofp_4i16_4f32() #0 {
+; CHECK-LABEL: @sitofp_4i16_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+ %cvt0 = sitofp i16 %ld0 to float
+ %cvt1 = sitofp i16 %ld1 to float
+ %cvt2 = sitofp i16 %ld2 to float
+ %cvt3 = sitofp i16 %ld3 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @sitofp_8i16_8f32() #0 {
+; SSE-LABEL: @sitofp_8i16_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sitofp_8i16_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+ %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+ %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+ %cvt0 = sitofp i16 %ld0 to float
+ %cvt1 = sitofp i16 %ld1 to float
+ %cvt2 = sitofp i16 %ld2 to float
+ %cvt3 = sitofp i16 %ld3 to float
+ %cvt4 = sitofp i16 %ld4 to float
+ %cvt5 = sitofp i16 %ld5 to float
+ %cvt6 = sitofp i16 %ld6 to float
+ %cvt7 = sitofp i16 %ld7 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+ store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+ store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
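+; As with i32, wider i16 cases split by register width: 4 x <4 x i16> for
+; SSE, 2 x <8 x i16> for AVX256, and a single <16 x i16> for AVX512.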
+define void @sitofp_16i16_16f32() #0 {
+; SSE-LABEL: @sitofp_16i16_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @sitofp_16i16_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
+; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float>
+; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_16i16_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8
+ %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4
+ %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7 ), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8 ), align 16
+ %ld9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9 ), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
+ %ld11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
+ %ld13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
+ %ld15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
+ %cvt0 = sitofp i16 %ld0 to float
+ %cvt1 = sitofp i16 %ld1 to float
+ %cvt2 = sitofp i16 %ld2 to float
+ %cvt3 = sitofp i16 %ld3 to float
+ %cvt4 = sitofp i16 %ld4 to float
+ %cvt5 = sitofp i16 %ld5 to float
+ %cvt6 = sitofp i16 %ld6 to float
+ %cvt7 = sitofp i16 %ld7 to float
+ %cvt8 = sitofp i16 %ld8 to float
+ %cvt9 = sitofp i16 %ld9 to float
+ %cvt10 = sitofp i16 %ld10 to float
+ %cvt11 = sitofp i16 %ld11 to float
+ %cvt12 = sitofp i16 %ld12 to float
+ %cvt13 = sitofp i16 %ld13 to float
+ %cvt14 = sitofp i16 %ld14 to float
+ %cvt15 = sitofp i16 %ld15 to float
+ store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+ store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+ store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+ store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+ store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+ store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+ store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+ store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+ store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
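+; i8 sources follow the same pattern as i16: SLP forms <4/8/16 x i8> loads
+; and the vector sitofp performs the sign-extending conversion.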
+define void @sitofp_4i8_4f32() #0 {
+; CHECK-LABEL: @sitofp_4i8_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+ %cvt0 = sitofp i8 %ld0 to float
+ %cvt1 = sitofp i8 %ld1 to float
+ %cvt2 = sitofp i8 %ld2 to float
+ %cvt3 = sitofp i8 %ld3 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @sitofp_8i8_8f32() #0 {
+; SSE-LABEL: @sitofp_8i8_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sitofp_8i8_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+ %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+ %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+ %cvt0 = sitofp i8 %ld0 to float
+ %cvt1 = sitofp i8 %ld1 to float
+ %cvt2 = sitofp i8 %ld2 to float
+ %cvt3 = sitofp i8 %ld3 to float
+ %cvt4 = sitofp i8 %ld4 to float
+ %cvt5 = sitofp i8 %ld5 to float
+ %cvt6 = sitofp i8 %ld6 to float
+ %cvt7 = sitofp i8 %ld7 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+ store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+ store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @sitofp_16i8_16f32() #0 {
+; SSE-LABEL: @sitofp_16i8_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @sitofp_16i8_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
+; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float>
+; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @sitofp_16i8_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = sitofp <16 x i8> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4
+ %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2
+ %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7 ), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8 ), align 8
+ %ld9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 9 ), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 10), align 2
+ %ld11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12), align 4
+ %ld13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 14), align 2
+ %ld15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 15), align 1
+ %cvt0 = sitofp i8 %ld0 to float
+ %cvt1 = sitofp i8 %ld1 to float
+ %cvt2 = sitofp i8 %ld2 to float
+ %cvt3 = sitofp i8 %ld3 to float
+ %cvt4 = sitofp i8 %ld4 to float
+ %cvt5 = sitofp i8 %ld5 to float
+ %cvt6 = sitofp i8 %ld6 to float
+ %cvt7 = sitofp i8 %ld7 to float
+ %cvt8 = sitofp i8 %ld8 to float
+ %cvt9 = sitofp i8 %ld9 to float
+ %cvt10 = sitofp i8 %ld10 to float
+ %cvt11 = sitofp i8 %ld11 to float
+ %cvt12 = sitofp i8 %ld12 to float
+ %cvt13 = sitofp i8 %ld13 to float
+ %cvt14 = sitofp i8 %ld14 to float
+ %cvt15 = sitofp i8 %ld15 to float
+ store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+ store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+ store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+ store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+ store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+ store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+ store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+ store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+ store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
+;
+; SITOFP BUILDVECTOR
+;
+
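+; These cases build a result vector from scalar arguments via insertelement
+; chains; as the checks show, the SLP vectorizer currently leaves the
+; sitofp-plus-insertelement pattern scalar.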
+define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
+; CHECK-LABEL: @sitofp_4xi32_4f64(
+; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to double
+; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to double
+; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to double
+; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to double
+; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x double> undef, double [[CVT0]], i32 0
+; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x double> [[RES0]], double [[CVT1]], i32 1
+; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x double> [[RES1]], double [[CVT2]], i32 2
+; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x double> [[RES2]], double [[CVT3]], i32 3
+; CHECK-NEXT: ret <4 x double> [[RES3]]
+;
+ %cvt0 = sitofp i32 %a0 to double
+ %cvt1 = sitofp i32 %a1 to double
+ %cvt2 = sitofp i32 %a2 to double
+ %cvt3 = sitofp i32 %a3 to double
+ %res0 = insertelement <4 x double> undef, double %cvt0, i32 0
+ %res1 = insertelement <4 x double> %res0, double %cvt1, i32 1
+ %res2 = insertelement <4 x double> %res1, double %cvt2, i32 2
+ %res3 = insertelement <4 x double> %res2, double %cvt3, i32 3
+ ret <4 x double> %res3
+}
+
+define <4 x float> @sitofp_4xi32_4f32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 {
+; CHECK-LABEL: @sitofp_4xi32_4f32(
+; CHECK-NEXT: [[CVT0:%.*]] = sitofp i32 [[A0:%.*]] to float
+; CHECK-NEXT: [[CVT1:%.*]] = sitofp i32 [[A1:%.*]] to float
+; CHECK-NEXT: [[CVT2:%.*]] = sitofp i32 [[A2:%.*]] to float
+; CHECK-NEXT: [[CVT3:%.*]] = sitofp i32 [[A3:%.*]] to float
+; CHECK-NEXT: [[RES0:%.*]] = insertelement <4 x float> undef, float [[CVT0]], i32 0
+; CHECK-NEXT: [[RES1:%.*]] = insertelement <4 x float> [[RES0]], float [[CVT1]], i32 1
+; CHECK-NEXT: [[RES2:%.*]] = insertelement <4 x float> [[RES1]], float [[CVT2]], i32 2
+; CHECK-NEXT: [[RES3:%.*]] = insertelement <4 x float> [[RES2]], float [[CVT3]], i32 3
+; CHECK-NEXT: ret <4 x float> [[RES3]]
+;
+ %cvt0 = sitofp i32 %a0 to float
+ %cvt1 = sitofp i32 %a1 to float
+ %cvt2 = sitofp i32 %a2 to float
+ %cvt3 = sitofp i32 %a3 to float
+ %res0 = insertelement <4 x float> undef, float %cvt0, i32 0
+ %res1 = insertelement <4 x float> %res0, float %cvt1, i32 1
+ %res2 = insertelement <4 x float> %res1, float %cvt2, i32 2
+ %res3 = insertelement <4 x float> %res2, float %cvt3, i32 3
+ ret <4 x float> %res3
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
new file mode 100644
index 00000000000..6752641753f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
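+; The checks show the vectorizer keeping this two-element load/store pattern
+; scalar on bdver2; the test exercises the SLP throttling (partial
+; vectorization) costing path, as inferred from the file name.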
+define dso_local void @rftbsub(double* %a) local_unnamed_addr #0 {
+; CHECK-LABEL: @rftbsub(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef
+; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]]
+; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]]
+; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef
+; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]]
+; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]]
+; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8
+; CHECK-NEXT: unreachable
+;
+entry:
+ %arrayidx6 = getelementptr inbounds double, double* %a, i64 2
+ %0 = load double, double* %arrayidx6, align 8
+ %1 = or i64 2, 1
+ %arrayidx12 = getelementptr inbounds double, double* %a, i64 %1
+ %2 = load double, double* %arrayidx12, align 8
+ %add16 = fadd double %2, undef
+ %mul18 = fmul double undef, %add16
+ %add19 = fadd double undef, %mul18
+ %sub22 = fsub double undef, undef
+ %sub25 = fsub double %0, %add19
+ store double %sub25, double* %arrayidx6, align 8
+ %sub29 = fsub double %2, %sub22
+ store double %sub29, double* %arrayidx12, align 8
+ unreachable
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll
new file mode 100644
index 00000000000..06d2134fb57
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x double] zeroinitializer, align 64
+@src32 = common global [16 x float] zeroinitializer, align 64
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
+
+;
+; SQRT
+;
+
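+; Scalar @llvm.sqrt.* calls roll up into the matching vector intrinsics
+; (@llvm.sqrt.v2f64, @llvm.sqrt.v4f32, ...), with the element count chosen
+; by the widest vector register each run's target provides.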
+define void @sqrt_2f64() #0 {
+; CHECK-LABEL: @sqrt_2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %sqrt0 = call double @llvm.sqrt.f64(double %a0)
+ %sqrt1 = call double @llvm.sqrt.f64(double %a1)
+ store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @sqrt_4f64() #0 {
+; SSE-LABEL: @sqrt_4f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sqrt_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8
+; AVX-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8
+; AVX-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8
+ %sqrt0 = call double @llvm.sqrt.f64(double %a0)
+ %sqrt1 = call double @llvm.sqrt.f64(double %a1)
+ %sqrt2 = call double @llvm.sqrt.f64(double %a2)
+ %sqrt3 = call double @llvm.sqrt.f64(double %a3)
+ store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 8
+ store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %sqrt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 8
+ store double %sqrt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @sqrt_8f64() #0 {
+; SSE-LABEL: @sqrt_8f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]])
+; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4]])
+; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @sqrt_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
+; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP2]])
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @sqrt_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[TMP1]])
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 4
+ %a1 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 4
+ %a2 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 4
+ %a3 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 4
+ %a4 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 4
+ %a5 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 4
+ %a6 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 4
+ %a7 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 4
+ %sqrt0 = call double @llvm.sqrt.f64(double %a0)
+ %sqrt1 = call double @llvm.sqrt.f64(double %a1)
+ %sqrt2 = call double @llvm.sqrt.f64(double %a2)
+ %sqrt3 = call double @llvm.sqrt.f64(double %a3)
+ %sqrt4 = call double @llvm.sqrt.f64(double %a4)
+ %sqrt5 = call double @llvm.sqrt.f64(double %a5)
+ %sqrt6 = call double @llvm.sqrt.f64(double %a6)
+ %sqrt7 = call double @llvm.sqrt.f64(double %a7)
+ store double %sqrt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 4
+ store double %sqrt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 4
+ store double %sqrt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 4
+ store double %sqrt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 4
+ store double %sqrt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 4
+ store double %sqrt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 4
+ store double %sqrt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 4
+ store double %sqrt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @sqrt_4f32() #0 {
+; CHECK-LABEL: @sqrt_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; CHECK-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %sqrt0 = call float @llvm.sqrt.f32(float %a0)
+ %sqrt1 = call float @llvm.sqrt.f32(float %a1)
+ %sqrt2 = call float @llvm.sqrt.f32(float %a2)
+ %sqrt3 = call float @llvm.sqrt.f32(float %a3)
+ store float %sqrt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %sqrt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %sqrt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %sqrt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @sqrt_8f32() #0 {
+; SSE-LABEL: @sqrt_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @sqrt_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %sqrt0 = call float @llvm.sqrt.f32(float %a0)
+ %sqrt1 = call float @llvm.sqrt.f32(float %a1)
+ %sqrt2 = call float @llvm.sqrt.f32(float %a2)
+ %sqrt3 = call float @llvm.sqrt.f32(float %a3)
+ %sqrt4 = call float @llvm.sqrt.f32(float %a4)
+ %sqrt5 = call float @llvm.sqrt.f32(float %a5)
+ %sqrt6 = call float @llvm.sqrt.f32(float %a6)
+ %sqrt7 = call float @llvm.sqrt.f32(float %a7)
+ store float %sqrt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %sqrt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %sqrt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %sqrt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %sqrt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %sqrt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %sqrt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %sqrt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
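+; SSE splits this into four <4 x float> calls, AVX256 into two <8 x float> calls, and AVX512 uses a single <16 x float> @llvm.sqrt.v16f32.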
+define void @sqrt_16f32() #0 {
+; SSE-LABEL: @sqrt_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
+; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
+; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]])
+; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4]])
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @sqrt_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
+; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP2]])
+; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4
+; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @sqrt_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x float>, <16 x float>* bitcast ([16 x float]* @src32 to <16 x float>*), align 4
+; AVX512-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[TMP1]])
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 4
+; AVX512-NEXT: ret void
+;
+ %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4
+ %a1 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4
+ %a2 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4
+ %a3 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4
+ %a4 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4
+ %a5 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4
+ %a6 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4
+ %a7 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4
+ %a8 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8), align 4
+ %a9 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 9), align 4
+ %a10 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 10), align 4
+ %a11 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 11), align 4
+ %a12 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12), align 4
+ %a13 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 13), align 4
+ %a14 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 14), align 4
+ %a15 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 15), align 4
+ %sqrt0 = call float @llvm.sqrt.f32(float %a0 )
+ %sqrt1 = call float @llvm.sqrt.f32(float %a1 )
+ %sqrt2 = call float @llvm.sqrt.f32(float %a2 )
+ %sqrt3 = call float @llvm.sqrt.f32(float %a3 )
+ %sqrt4 = call float @llvm.sqrt.f32(float %a4 )
+ %sqrt5 = call float @llvm.sqrt.f32(float %a5 )
+ %sqrt6 = call float @llvm.sqrt.f32(float %a6 )
+ %sqrt7 = call float @llvm.sqrt.f32(float %a7 )
+ %sqrt8 = call float @llvm.sqrt.f32(float %a8 )
+ %sqrt9 = call float @llvm.sqrt.f32(float %a9 )
+ %sqrt10 = call float @llvm.sqrt.f32(float %a10)
+ %sqrt11 = call float @llvm.sqrt.f32(float %a11)
+ %sqrt12 = call float @llvm.sqrt.f32(float %a12)
+ %sqrt13 = call float @llvm.sqrt.f32(float %a13)
+ %sqrt14 = call float @llvm.sqrt.f32(float %a14)
+ %sqrt15 = call float @llvm.sqrt.f32(float %a15)
+ store float %sqrt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 4
+ store float %sqrt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %sqrt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 4
+ store float %sqrt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %sqrt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 4
+ store float %sqrt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %sqrt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 4
+ store float %sqrt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ store float %sqrt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8), align 4
+ store float %sqrt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9), align 4
+ store float %sqrt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 4
+ store float %sqrt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %sqrt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 4
+ store float %sqrt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %sqrt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 4
+ store float %sqrt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
new file mode 100644
index 00000000000..2255a12342f
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+
+
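+; The stores below are jumbled, so both vector loads are reordered with a <1, 3, 0, 2> shufflevector to allow a single in-order <4 x i32> store.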
+define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
+; CHECK-LABEL: @jumbled-load(
+; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
+; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret i32 undef
+;
+ %in.addr = getelementptr inbounds i32, i32* %in, i64 0
+ %load.1 = load i32, i32* %in.addr, align 4
+ %gep.1 = getelementptr inbounds i32, i32* %in.addr, i64 1
+ %load.2 = load i32, i32* %gep.1, align 4
+ %gep.2 = getelementptr inbounds i32, i32* %in.addr, i64 2
+ %load.3 = load i32, i32* %gep.2, align 4
+ %gep.3 = getelementptr inbounds i32, i32* %in.addr, i64 3
+ %load.4 = load i32, i32* %gep.3, align 4
+ %inn.addr = getelementptr inbounds i32, i32* %inn, i64 0
+ %load.5 = load i32, i32* %inn.addr, align 4
+ %gep.4 = getelementptr inbounds i32, i32* %inn.addr, i64 1
+ %load.6 = load i32, i32* %gep.4, align 4
+ %gep.5 = getelementptr inbounds i32, i32* %inn.addr, i64 2
+ %load.7 = load i32, i32* %gep.5, align 4
+ %gep.6 = getelementptr inbounds i32, i32* %inn.addr, i64 3
+ %load.8 = load i32, i32* %gep.6, align 4
+ %mul.1 = mul i32 %load.1, %load.5
+ %mul.2 = mul i32 %load.2, %load.6
+ %mul.3 = mul i32 %load.3, %load.7
+ %mul.4 = mul i32 %load.4, %load.8
+ %gep.7 = getelementptr inbounds i32, i32* %out, i64 0
+ %gep.8 = getelementptr inbounds i32, i32* %out, i64 1
+ %gep.9 = getelementptr inbounds i32, i32* %out, i64 2
+ %gep.10 = getelementptr inbounds i32, i32* %out, i64 3
+ store i32 %mul.1, i32* %gep.9, align 4
+ store i32 %mul.2, i32* %gep.7, align 4
+ store i32 %mul.3, i32* %gep.10, align 4
+ store i32 %mul.4, i32* %gep.8, align 4
+
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
new file mode 100644
index 00000000000..48f2687b377
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll
@@ -0,0 +1,316 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
+
+;void Distance(float *p1, int p2, unsigned long p3[], float p4[]) {
+; long a = p3[0] = 5;
+; p1 += p2;
+; p4[3] += p1[a];
+; p3[0] >>= 5;
+; p3[1] >>= 5;
+; p3[2] >>= 5;
+; p3[3] >>= 5;
+; p1 += p2;
+; p4[0] += p1[p3[0] & a];
+;}
+
+define void @_Z8DistanceIlLi5EEvPfiPmS0_(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @_Z8DistanceIlLi5EEvPfiPmS0_(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], <i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
+; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8
+; CHECK-NEXT: [[ADD_PTR11:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
+; CHECK-NEXT: [[AND:%.*]] = and i64 [[TMP6]], 5
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds float, float* [[ADD_PTR11]], i64 [[AND]]
+; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX13]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[P4]], align 4
+; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: store float [[ADD15]], float* [[P4]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i64 5, i64* %p3, align 8
+ %idx.ext = sext i32 %p2 to i64
+ %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+ %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+ %0 = load float, float* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+ %1 = load float, float* %arrayidx2, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %arrayidx2, align 4
+ %2 = load i64, i64* %p3, align 8
+ %shr = lshr i64 %2, 5
+ store i64 %shr, i64* %p3, align 8
+ %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+ %3 = load i64, i64* %arrayidx4, align 8
+ %shr5 = lshr i64 %3, 5
+ store i64 %shr5, i64* %arrayidx4, align 8
+ %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+ %4 = load i64, i64* %arrayidx6, align 8
+ %shr7 = lshr i64 %4, 5
+ store i64 %shr7, i64* %arrayidx6, align 8
+ %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+ %5 = load i64, i64* %arrayidx8, align 8
+ %shr9 = lshr i64 %5, 5
+ store i64 %shr9, i64* %arrayidx8, align 8
+ %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+ %and = and i64 %shr, 5
+ %arrayidx13 = getelementptr inbounds float, float* %add.ptr11, i64 %and
+ %6 = load float, float* %arrayidx13, align 4
+ %7 = load float, float* %p4, align 4
+ %add15 = fadd float %6, %7
+ store float %add15, float* %p4, align 4
+ ret void
+}
+
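+; The scalar stores run from index 7 down to 4, so both operand load groups are reversed with <3, 2, 1, 0> shufflevectors before the vector shl and the <4 x i64> store.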
+define void @store_reverse(i64* %p3) {
+; CHECK-LABEL: @store_reverse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[P3:%.*]], i64 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 7
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 9
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 6
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 10
+; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
+; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP3]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>*
+; CHECK-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* [[TMP7]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i64, i64* %p3, align 8
+ %arrayidx1 = getelementptr inbounds i64, i64* %p3, i64 8
+ %1 = load i64, i64* %arrayidx1, align 8
+ %shl = shl i64 %0, %1
+ %arrayidx2 = getelementptr inbounds i64, i64* %p3, i64 7
+ store i64 %shl, i64* %arrayidx2, align 8
+ %arrayidx3 = getelementptr inbounds i64, i64* %p3, i64 1
+ %2 = load i64, i64* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 9
+ %3 = load i64, i64* %arrayidx4, align 8
+ %shl5 = shl i64 %2, %3
+ %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 6
+ store i64 %shl5, i64* %arrayidx6, align 8
+ %arrayidx7 = getelementptr inbounds i64, i64* %p3, i64 2
+ %4 = load i64, i64* %arrayidx7, align 8
+ %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 10
+ %5 = load i64, i64* %arrayidx8, align 8
+ %shl9 = shl i64 %4, %5
+ %arrayidx10 = getelementptr inbounds i64, i64* %p3, i64 5
+ store i64 %shl9, i64* %arrayidx10, align 8
+ %arrayidx11 = getelementptr inbounds i64, i64* %p3, i64 3
+ %6 = load i64, i64* %arrayidx11, align 8
+ %arrayidx12 = getelementptr inbounds i64, i64* %p3, i64 11
+ %7 = load i64, i64* %arrayidx12, align 8
+ %shl13 = shl i64 %6, %7
+ %arrayidx14 = getelementptr inbounds i64, i64* %p3, i64 4
+ store i64 %shl13, i64* %arrayidx14, align 8
+ ret void
+}
+
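+; In @store15 and @store16 only the first two shifted stores are combined into a <2 x i64> lshr; the remaining shifts and the repeated stores to %arrayidx9 stay scalar.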
+define void @store15(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @store15(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], <i64 5, i64 5>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i64 [[TMP6]], 5
+; CHECK-NEXT: store i64 [[SHR7]], i64* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ store i64 5, i64* %p3, align 8
+ %idx.ext = sext i32 %p2 to i64
+ %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+ %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+ %0 = load float, float* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+ %1 = load float, float* %arrayidx2, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %arrayidx2, align 4
+ %2 = load i64, i64* %p3, align 8
+ %shr = lshr i64 %2, 5
+ store i64 %shr, i64* %p3, align 8
+ %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+ %3 = load i64, i64* %arrayidx4, align 8
+ %shr5 = lshr i64 %3, 5
+ store i64 %shr5, i64* %arrayidx4, align 8
+ %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+ %4 = load i64, i64* %arrayidx6, align 8
+ %shr7 = lshr i64 %4, 5
+ store i64 %shr7, i64* %arrayidx6, align 8
+ %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+ %5 = load i64, i64* %arrayidx8, align 8
+ %shr9 = lshr i64 %5, 5
+ %arrayidx9 = getelementptr inbounds i64, i64* %p3, i64 5
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 %shr9, i64* %arrayidx8, align 8
+ ret void
+}
+
+define void @store16(float* %p1, i32 %p2, i64* %p3, float* %p4) {
+; CHECK-LABEL: @store16(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: store i64 5, i64* [[P3:%.*]], align 8
+; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[P2:%.*]] to i64
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[P1:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[ADD_PTR]], i64 5
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[P4:%.*]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP3]], <i64 5, i64 5>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <2 x i64>*
+; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i64 [[TMP6]], 5
+; CHECK-NEXT: store i64 [[SHR7]], i64* [[ARRAYIDX6]], align 8
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i64 [[TMP7]], 5
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8
+; CHECK-NEXT: store i64 [[SHR9]], i64* [[ARRAYIDX8]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ store i64 5, i64* %p3, align 8
+ %idx.ext = sext i32 %p2 to i64
+ %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+ %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+ %0 = load float, float* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+ %1 = load float, float* %arrayidx2, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %arrayidx2, align 4
+ %2 = load i64, i64* %p3, align 8
+ %shr = lshr i64 %2, 5
+ store i64 %shr, i64* %p3, align 8
+ %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+ %3 = load i64, i64* %arrayidx4, align 8
+ %shr5 = lshr i64 %3, 5
+ store i64 %shr5, i64* %arrayidx4, align 8
+ %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+ %4 = load i64, i64* %arrayidx6, align 8
+ %shr7 = lshr i64 %4, 5
+ store i64 %shr7, i64* %arrayidx6, align 8
+ %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+ %5 = load i64, i64* %arrayidx8, align 8
+ %shr9 = lshr i64 %5, 5
+ %arrayidx9 = getelementptr inbounds i64, i64* %p3, i64 5
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 5, i64* %arrayidx9, align 8
+ store i64 %shr9, i64* %arrayidx8, align 8
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
new file mode 100644
index 00000000000..19f06805ff5
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -0,0 +1,271 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
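+; Both tiny trees below are fully vectorizable: the consecutive loads and stores become <2 x double> and <4 x float> operations.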
+define void @tiny_tree_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
+; CHECK-LABEL: @tiny_tree_fully_vectorizable(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP12:%.*]] = icmp eq i64 [[COUNT:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[SRC_ADDR_013]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[DST_ADDR_014]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]]
+; CHECK-NEXT: [[ADD_PTR4]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 [[I_015]]
+; CHECK-NEXT: [[INC]] = add i64 [[I_015]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp12 = icmp eq i64 %count, 0
+ br i1 %cmp12, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
+ %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
+ %0 = load double, double* %src.addr.013, align 8
+ store double %0, double* %dst.addr.014, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %src.addr.013, i64 1
+ %1 = load double, double* %arrayidx2, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %dst.addr.014, i64 1
+ store double %1, double* %arrayidx3, align 8
+ %add.ptr = getelementptr inbounds double, double* %src.addr.013, i64 %i.015
+ %add.ptr4 = getelementptr inbounds double, double* %dst.addr.014, i64 %i.015
+ %inc = add i64 %i.015, 1
+ %exitcond = icmp eq i64 %inc, %count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+define void @tiny_tree_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
+; CHECK-LABEL: @tiny_tree_fully_vectorizable2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i64 [[COUNT:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP20]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 1
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC_ADDR_021]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
+; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
+; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp20 = icmp eq i64 %count, 0
+ br i1 %cmp20, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
+ %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
+ %0 = load float, float* %src.addr.021, align 4
+ store float %0, float* %dst.addr.022, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %src.addr.021, i64 1
+ %1 = load float, float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds float, float* %dst.addr.022, i64 1
+ store float %1, float* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %src.addr.021, i64 2
+ %2 = load float, float* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds float, float* %dst.addr.022, i64 2
+ store float %2, float* %arrayidx5, align 4
+ %arrayidx6 = getelementptr inbounds float, float* %src.addr.021, i64 3
+ %3 = load float, float* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds float, float* %dst.addr.022, i64 3
+ store float %3, float* %arrayidx7, align 4
+ %add.ptr = getelementptr inbounds float, float* %src.addr.021, i64 %i.023
+ %add.ptr8 = getelementptr inbounds float, float* %dst.addr.022, i64 %i.023
+ %inc = add i64 %i.023, 1
+ %exitcond = icmp eq i64 %inc, %count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; We do not vectorize tiny trees that are not fully vectorizable; the source loads in the two functions below are not consecutive.
+
+define void @tiny_tree_not_fully_vectorizable(double* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %count) #0 {
+; CHECK-LABEL: @tiny_tree_not_fully_vectorizable(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP12:%.*]] = icmp eq i64 [[COUNT:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_015:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[SRC_ADDR_013]], align 8
+; CHECK-NEXT: store double [[TMP0]], double* [[DST_ADDR_014]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1
+; CHECK-NEXT: store double [[TMP1]], double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]]
+; CHECK-NEXT: [[ADD_PTR4]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 [[I_015]]
+; CHECK-NEXT: [[INC]] = add i64 [[I_015]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp12 = icmp eq i64 %count, 0
+ br i1 %cmp12, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.015 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %dst.addr.014 = phi double* [ %add.ptr4, %for.body ], [ %dst, %entry ]
+ %src.addr.013 = phi double* [ %add.ptr, %for.body ], [ %src, %entry ]
+ %0 = load double, double* %src.addr.013, align 8
+ store double %0, double* %dst.addr.014, align 8
+ %arrayidx2 = getelementptr inbounds double, double* %src.addr.013, i64 2
+ %1 = load double, double* %arrayidx2, align 8
+ %arrayidx3 = getelementptr inbounds double, double* %dst.addr.014, i64 1
+ store double %1, double* %arrayidx3, align 8
+ %add.ptr = getelementptr inbounds double, double* %src.addr.013, i64 %i.015
+ %add.ptr4 = getelementptr inbounds double, double* %dst.addr.014, i64 %i.015
+ %inc = add i64 %i.015, 1
+ %exitcond = icmp eq i64 %inc, %count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+define void @tiny_tree_not_fully_vectorizable2(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %count) #0 {
+; CHECK-LABEL: @tiny_tree_not_fully_vectorizable2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP20:%.*]] = icmp eq i64 [[COUNT:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP20]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4
+; CHECK-NEXT: store float [[TMP0]], float* [[DST_ADDR_022]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
+; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
+; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4
+; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
+; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
+; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
+; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp20 = icmp eq i64 %count, 0
+ br i1 %cmp20, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.023 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %dst.addr.022 = phi float* [ %add.ptr8, %for.body ], [ %dst, %entry ]
+ %src.addr.021 = phi float* [ %add.ptr, %for.body ], [ %src, %entry ]
+ %0 = load float, float* %src.addr.021, align 4
+ store float %0, float* %dst.addr.022, align 4
+ %arrayidx2 = getelementptr inbounds float, float* %src.addr.021, i64 4
+ %1 = load float, float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds float, float* %dst.addr.022, i64 1
+ store float %1, float* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds float, float* %src.addr.021, i64 2
+ %2 = load float, float* %arrayidx4, align 4
+ %arrayidx5 = getelementptr inbounds float, float* %dst.addr.022, i64 2
+ store float %2, float* %arrayidx5, align 4
+ %arrayidx6 = getelementptr inbounds float, float* %src.addr.021, i64 3
+ %3 = load float, float* %arrayidx6, align 4
+ %arrayidx7 = getelementptr inbounds float, float* %dst.addr.022, i64 3
+ store float %3, float* %arrayidx7, align 4
+ %add.ptr = getelementptr inbounds float, float* %src.addr.021, i64 %i.023
+ %add.ptr8 = getelementptr inbounds float, float* %dst.addr.022, i64 %i.023
+ %inc = add i64 %i.023, 1
+ %exitcond = icmp eq i64 %inc, %count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
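+; A scalar splatted across four consecutive stores is widened with insertelement into one <4 x float> store.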
+define void @store_splat(float*, float) {
+; CHECK-LABEL: @store_splat(
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 3
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> undef, float [[TMP1:%.*]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i32 2
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP1]], i32 3
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP10]], <4 x float>* [[TMP11]], align 4
+; CHECK-NEXT: ret void
+;
+ %3 = getelementptr inbounds float, float* %0, i64 0
+ store float %1, float* %3, align 4
+ %4 = getelementptr inbounds float, float* %0, i64 1
+ store float %1, float* %4, align 4
+ %5 = getelementptr inbounds float, float* %0, i64 2
+ store float %1, float* %5, align 4
+ %6 = getelementptr inbounds float, float* %0, i64 3
+ store float %1, float* %6, align 4
+ ret void
+}
+
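+; Four constant stores to consecutive addresses are merged into a single <4 x i32> constant store, keeping the per-index values <10, 30, 20, 40>.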
+define void @store_const(i32* %a) {
+; CHECK-LABEL: @store_const(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 0
+; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[PTR0]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> <i32 10, i32 30, i32 20, i32 40>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %ptr0 = getelementptr inbounds i32, i32* %a, i64 0
+ store i32 10, i32* %ptr0, align 4
+ %ptr1 = getelementptr inbounds i32, i32* %a, i64 1
+ store i32 30, i32* %ptr1, align 4
+ %ptr2 = getelementptr inbounds i32, i32* %a, i64 2
+ store i32 20, i32* %ptr2, align 4
+ %ptr3 = getelementptr inbounds i32, i32* %a, i64 3
+ store i32 40, i32* %ptr3, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
new file mode 100644
index 00000000000..092918115eb
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll
@@ -0,0 +1,1166 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256NODQ
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX256 --check-prefix=AVX256DQ
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@src64 = common global [8 x i64] zeroinitializer, align 64
+@src32 = common global [16 x i32] zeroinitializer, align 64
+@src16 = common global [32 x i16] zeroinitializer, align 64
+@src8 = common global [64 x i8] zeroinitializer, align 64
+
+@dst64 = common global [8 x double] zeroinitializer, align 64
+@dst32 = common global [16 x float] zeroinitializer, align 64
+
+;
+; UITOFP to vXf64
+;
+
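+; All run lines vectorize the 2 x i64 case into a <2 x i64> load, uitofp and <2 x double> store.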
+define void @uitofp_2i64_2f64() #0 {
+; CHECK-LABEL: @uitofp_2i64_2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; CHECK-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %cvt0 = uitofp i64 %ld0 to double
+ %cvt1 = uitofp i64 %ld1 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @uitofp_4i64_4f64() #0 {
+; SSE-LABEL: @uitofp_4i64_4f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_4i64_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+ %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+ %cvt0 = uitofp i64 %ld0 to double
+ %cvt1 = uitofp i64 %ld1 to double
+ %cvt2 = uitofp i64 %ld2 to double
+ %cvt3 = uitofp i64 %ld3 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @uitofp_8i64_8f64() #0 {
+; SSE-LABEL: @uitofp_8i64_8f64(
+; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16
+; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32
+; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double>
+; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double>
+; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double>
+; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16
+; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32
+; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @uitofp_8i64_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32
+; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double>
+; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double>
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_8i64_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+ %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+ %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+ %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+ %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+ %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+ %cvt0 = uitofp i64 %ld0 to double
+ %cvt1 = uitofp i64 %ld1 to double
+ %cvt2 = uitofp i64 %ld2 to double
+ %cvt3 = uitofp i64 %ld3 to double
+ %cvt4 = uitofp i64 %ld4 to double
+ %cvt5 = uitofp i64 %ld5 to double
+ %cvt6 = uitofp i64 %ld6 to double
+ %cvt7 = uitofp i64 %ld7 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+ store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+ store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
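+; The 2 x i32 case stays scalar on the SSE and AVX256NODQ targets; AVX512 and AVX256DQ vectorize it to a <2 x i32> uitofp.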
+define void @uitofp_2i32_2f64() #0 {
+; SSE-LABEL: @uitofp_2i32_2f64(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @uitofp_2i32_2f64(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_2i32_2f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
+; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @uitofp_2i32_2f64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %cvt0 = uitofp i32 %ld0 to double
+ %cvt1 = uitofp i32 %ld1 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @uitofp_4i32_4f64() #0 {
+; SSE-LABEL: @uitofp_4i32_4f64(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i32 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i32 [[LD3]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_4i32_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+ %cvt0 = uitofp i32 %ld0 to double
+ %cvt1 = uitofp i32 %ld1 to double
+ %cvt2 = uitofp i32 %ld2 to double
+ %cvt3 = uitofp i32 %ld3 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @uitofp_8i32_8f64() #0 {
+; SSE-LABEL: @uitofp_8i32_8f64(
+; SSE-NEXT: [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+; SSE-NEXT: [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+; SSE-NEXT: [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+; SSE-NEXT: [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+; SSE-NEXT: [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+; SSE-NEXT: [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+; SSE-NEXT: [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i32 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i32 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i32 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i32 [[LD3]] to double
+; SSE-NEXT: [[CVT4:%.*]] = uitofp i32 [[LD4]] to double
+; SSE-NEXT: [[CVT5:%.*]] = uitofp i32 [[LD5]] to double
+; SSE-NEXT: [[CVT6:%.*]] = uitofp i32 [[LD6]] to double
+; SSE-NEXT: [[CVT7:%.*]] = uitofp i32 [[LD7]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @uitofp_8i32_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double>
+; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x double>
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_8i32_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+ %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+ %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+ %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+ %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+ %cvt0 = uitofp i32 %ld0 to double
+ %cvt1 = uitofp i32 %ld1 to double
+ %cvt2 = uitofp i32 %ld2 to double
+ %cvt3 = uitofp i32 %ld3 to double
+ %cvt4 = uitofp i32 %ld4 to double
+ %cvt5 = uitofp i32 %ld5 to double
+ %cvt6 = uitofp i32 %ld6 to double
+ %cvt7 = uitofp i32 %ld7 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+ store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+ store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
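+; i16 -> f64: the 2-element case stays scalar on every run line (hence the
+; shared CHECK prefix); at 4 and 8 elements SSE remains scalar while the AVX
+; runs use <4 x i16> and <8 x i16> vector conversions.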
+define void @uitofp_2i16_2f64() #0 {
+; CHECK-LABEL: @uitofp_2i16_2f64(
+; CHECK-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; CHECK-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; CHECK-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
+; CHECK-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
+; CHECK-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; CHECK-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %cvt0 = uitofp i16 %ld0 to double
+ %cvt1 = uitofp i16 %ld1 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
+define void @uitofp_4i16_4f64() #0 {
+; SSE-LABEL: @uitofp_4i16_4f64(
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_4i16_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+ %cvt0 = uitofp i16 %ld0 to double
+ %cvt1 = uitofp i16 %ld1 to double
+ %cvt2 = uitofp i16 %ld2 to double
+ %cvt3 = uitofp i16 %ld3 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @uitofp_8i16_8f64() #0 {
+; SSE-LABEL: @uitofp_8i16_8f64(
+; SSE-NEXT: [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+; SSE-NEXT: [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+; SSE-NEXT: [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+; SSE-NEXT: [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+; SSE-NEXT: [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+; SSE-NEXT: [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+; SSE-NEXT: [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i16 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i16 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i16 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i16 [[LD3]] to double
+; SSE-NEXT: [[CVT4:%.*]] = uitofp i16 [[LD4]] to double
+; SSE-NEXT: [[CVT5:%.*]] = uitofp i16 [[LD5]] to double
+; SSE-NEXT: [[CVT6:%.*]] = uitofp i16 [[LD6]] to double
+; SSE-NEXT: [[CVT7:%.*]] = uitofp i16 [[LD7]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @uitofp_8i16_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double>
+; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x double>
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_8i16_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+ %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+ %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+ %cvt0 = uitofp i16 %ld0 to double
+ %cvt1 = uitofp i16 %ld1 to double
+ %cvt2 = uitofp i16 %ld2 to double
+ %cvt3 = uitofp i16 %ld3 to double
+ %cvt4 = uitofp i16 %ld4 to double
+ %cvt5 = uitofp i16 %ld5 to double
+ %cvt6 = uitofp i16 %ld6 to double
+ %cvt7 = uitofp i16 %ld7 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+ store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+ store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
+define void @uitofp_2i8_2f64() #0 {
+; SSE-LABEL: @uitofp_2i8_2f64(
+; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @uitofp_2i8_2f64(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; AVX256NODQ-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_2i8_2f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; AVX512-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @uitofp_2i8_2f64(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double>
+; AVX256DQ-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %cvt0 = uitofp i8 %ld0 to double
+ %cvt1 = uitofp i8 %ld1 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
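+; i8 -> f64: only the AVX512 and AVX256DQ runs vectorize the 2-element case;
+; at 4 and 8 elements SSE stays scalar while the AVX runs vectorize.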
+define void @uitofp_4i8_4f64() #0 {
+; SSE-LABEL: @uitofp_4i8_4f64(
+; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i8 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i8 [[LD3]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_4i8_4f64(
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+ %cvt0 = uitofp i8 %ld0 to double
+ %cvt1 = uitofp i8 %ld1 to double
+ %cvt2 = uitofp i8 %ld2 to double
+ %cvt3 = uitofp i8 %ld3 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ ret void
+}
+
+define void @uitofp_8i8_8f64() #0 {
+; SSE-LABEL: @uitofp_8i8_8f64(
+; SSE-NEXT: [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+; SSE-NEXT: [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+; SSE-NEXT: [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+; SSE-NEXT: [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+; SSE-NEXT: [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+; SSE-NEXT: [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+; SSE-NEXT: [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i8 [[LD0]] to double
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i8 [[LD1]] to double
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i8 [[LD2]] to double
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i8 [[LD3]] to double
+; SSE-NEXT: [[CVT4:%.*]] = uitofp i8 [[LD4]] to double
+; SSE-NEXT: [[CVT5:%.*]] = uitofp i8 [[LD5]] to double
+; SSE-NEXT: [[CVT6:%.*]] = uitofp i8 [[LD6]] to double
+; SSE-NEXT: [[CVT7:%.*]] = uitofp i8 [[LD7]] to double
+; SSE-NEXT: store double [[CVT0]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+; SSE-NEXT: store double [[CVT1]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+; SSE-NEXT: store double [[CVT2]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+; SSE-NEXT: store double [[CVT3]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+; SSE-NEXT: store double [[CVT4]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+; SSE-NEXT: store double [[CVT5]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+; SSE-NEXT: store double [[CVT6]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+; SSE-NEXT: store double [[CVT7]], double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @uitofp_8i8_8f64(
+; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double>
+; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x double>
+; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64
+; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_8i8_8f64(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x double>
+; AVX512-NEXT: store <8 x double> [[TMP2]], <8 x double>* bitcast ([8 x double]* @dst64 to <8 x double>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+ %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+ %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+ %cvt0 = uitofp i8 %ld0 to double
+ %cvt1 = uitofp i8 %ld1 to double
+ %cvt2 = uitofp i8 %ld2 to double
+ %cvt3 = uitofp i8 %ld3 to double
+ %cvt4 = uitofp i8 %ld4 to double
+ %cvt5 = uitofp i8 %ld5 to double
+ %cvt6 = uitofp i8 %ld6 to double
+ %cvt7 = uitofp i8 %ld7 to double
+ store double %cvt0, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 0), align 64
+ store double %cvt1, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 1), align 8
+ store double %cvt2, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2), align 16
+ store double %cvt3, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 3), align 8
+ store double %cvt4, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4), align 32
+ store double %cvt5, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 5), align 8
+ store double %cvt6, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6), align 16
+ store double %cvt7, double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 7), align 8
+ ret void
+}
+
+;
+; UITOFP to vXf32
+;
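+; Note: the i64 -> f32 conversions below only vectorize on the DQ-capable
+; runs (AVX512, AVX256DQ); SSE and AVX256NODQ keep scalar uitofp, and the
+; 2-element case stays scalar on every subtarget.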
+
+define void @uitofp_2i64_2f32() #0 {
+; CHECK-LABEL: @uitofp_2i64_2f32(
+; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; CHECK-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; CHECK-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %cvt0 = uitofp i64 %ld0 to float
+ %cvt1 = uitofp i64 %ld1 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ ret void
+}
+
+define void @uitofp_4i64_4f32() #0 {
+; SSE-LABEL: @uitofp_4i64_4f32(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @uitofp_4i64_4f32(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_4i64_4f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX512-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @uitofp_4i64_4f32(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float>
+; AVX256DQ-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+ %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+ %cvt0 = uitofp i64 %ld0 to float
+ %cvt1 = uitofp i64 %ld1 to float
+ %cvt2 = uitofp i64 %ld2 to float
+ %cvt3 = uitofp i64 %ld3 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @uitofp_8i64_8f32() #0 {
+; SSE-LABEL: @uitofp_8i64_8f32(
+; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; SSE-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; SSE-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; SSE-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; SSE-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; SSE-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; SSE-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; SSE-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; SSE-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
+; SSE-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
+; SSE-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
+; SSE-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
+; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; SSE-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; SSE-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; SSE-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; SSE-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; SSE-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; SSE-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; SSE-NEXT: ret void
+;
+; AVX256NODQ-LABEL: @uitofp_8i64_8f32(
+; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+; AVX256NODQ-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+; AVX256NODQ-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+; AVX256NODQ-NEXT: [[LD4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+; AVX256NODQ-NEXT: [[LD5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+; AVX256NODQ-NEXT: [[LD6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+; AVX256NODQ-NEXT: [[LD7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+; AVX256NODQ-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float
+; AVX256NODQ-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float
+; AVX256NODQ-NEXT: [[CVT2:%.*]] = uitofp i64 [[LD2]] to float
+; AVX256NODQ-NEXT: [[CVT3:%.*]] = uitofp i64 [[LD3]] to float
+; AVX256NODQ-NEXT: [[CVT4:%.*]] = uitofp i64 [[LD4]] to float
+; AVX256NODQ-NEXT: [[CVT5:%.*]] = uitofp i64 [[LD5]] to float
+; AVX256NODQ-NEXT: [[CVT6:%.*]] = uitofp i64 [[LD6]] to float
+; AVX256NODQ-NEXT: [[CVT7:%.*]] = uitofp i64 [[LD7]] to float
+; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+; AVX256NODQ-NEXT: store float [[CVT2]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+; AVX256NODQ-NEXT: store float [[CVT3]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+; AVX256NODQ-NEXT: store float [[CVT4]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+; AVX256NODQ-NEXT: store float [[CVT5]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+; AVX256NODQ-NEXT: store float [[CVT6]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+; AVX256NODQ-NEXT: store float [[CVT7]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+; AVX256NODQ-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_8i64_8f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX512-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+; AVX256DQ-LABEL: @uitofp_8i64_8f32(
+; AVX256DQ-NEXT: [[TMP1:%.*]] = load <8 x i64>, <8 x i64>* bitcast ([8 x i64]* @src64 to <8 x i64>*), align 64
+; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x float>
+; AVX256DQ-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256DQ-NEXT: ret void
+;
+ %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64
+ %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8
+ %ld2 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2), align 16
+ %ld3 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 3), align 8
+ %ld4 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4), align 32
+ %ld5 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 5), align 8
+ %ld6 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6), align 16
+ %ld7 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 7), align 8
+ %cvt0 = uitofp i64 %ld0 to float
+ %cvt1 = uitofp i64 %ld1 to float
+ %cvt2 = uitofp i64 %ld2 to float
+ %cvt3 = uitofp i64 %ld3 to float
+ %cvt4 = uitofp i64 %ld4 to float
+ %cvt5 = uitofp i64 %ld5 to float
+ %cvt6 = uitofp i64 %ld6 to float
+ %cvt7 = uitofp i64 %ld7 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+ store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+ store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
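+; i32 -> f32 vectorizes on every run line, including SSE; the wider element
+; counts below split according to the subtarget's preferred vector width.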
+define void @uitofp_4i32_4f32() #0 {
+; CHECK-LABEL: @uitofp_4i32_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+ %cvt0 = uitofp i32 %ld0 to float
+ %cvt1 = uitofp i32 %ld1 to float
+ %cvt2 = uitofp i32 %ld2 to float
+ %cvt3 = uitofp i32 %ld3 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @uitofp_8i32_8f32() #0 {
+; SSE-LABEL: @uitofp_8i32_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_8i32_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3), align 4
+ %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4), align 16
+ %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5), align 4
+ %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6), align 8
+ %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7), align 4
+ %cvt0 = uitofp i32 %ld0 to float
+ %cvt1 = uitofp i32 %ld1 to float
+ %cvt2 = uitofp i32 %ld2 to float
+ %cvt3 = uitofp i32 %ld3 to float
+ %cvt4 = uitofp i32 %ld4 to float
+ %cvt5 = uitofp i32 %ld5 to float
+ %cvt6 = uitofp i32 %ld6 to float
+ %cvt7 = uitofp i32 %ld7 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+ store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+ store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @uitofp_16i32_16f32() #0 {
+; SSE-LABEL: @uitofp_16i32_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16
+; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float>
+; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float>
+; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i32> [[TMP4]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @uitofp_16i32_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32
+; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float>
+; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i32> [[TMP2]] to <8 x float>
+; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_16i32_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @src32 to <16 x i32>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i32> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 0 ), align 64
+ %ld1 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 1 ), align 4
+ %ld2 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2 ), align 8
+ %ld3 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 3 ), align 4
+ %ld4 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4 ), align 16
+ %ld5 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 5 ), align 4
+ %ld6 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6 ), align 8
+ %ld7 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 7 ), align 4
+ %ld8 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8 ), align 32
+ %ld9 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 9 ), align 4
+ %ld10 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 10), align 8
+ %ld11 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 11), align 4
+ %ld12 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12), align 16
+ %ld13 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 13), align 4
+ %ld14 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 14), align 8
+ %ld15 = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 15), align 4
+ %cvt0 = uitofp i32 %ld0 to float
+ %cvt1 = uitofp i32 %ld1 to float
+ %cvt2 = uitofp i32 %ld2 to float
+ %cvt3 = uitofp i32 %ld3 to float
+ %cvt4 = uitofp i32 %ld4 to float
+ %cvt5 = uitofp i32 %ld5 to float
+ %cvt6 = uitofp i32 %ld6 to float
+ %cvt7 = uitofp i32 %ld7 to float
+ %cvt8 = uitofp i32 %ld8 to float
+ %cvt9 = uitofp i32 %ld9 to float
+ %cvt10 = uitofp i32 %ld10 to float
+ %cvt11 = uitofp i32 %ld11 to float
+ %cvt12 = uitofp i32 %ld12 to float
+ %cvt13 = uitofp i32 %ld13 to float
+ %cvt14 = uitofp i32 %ld14 to float
+ %cvt15 = uitofp i32 %ld15 to float
+ store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+ store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+ store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+ store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+ store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+ store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+ store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+ store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+ store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
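+; i16 -> f32 likewise vectorizes everywhere at 4 elements; 8 and 16 elements
+; split into <4 x i16>, <8 x i16>, or <16 x i16> chunks per subtarget.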
+define void @uitofp_4i16_4f32() #0 {
+; CHECK-LABEL: @uitofp_4i16_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+ %cvt0 = uitofp i16 %ld0 to float
+ %cvt1 = uitofp i16 %ld1 to float
+ %cvt2 = uitofp i16 %ld2 to float
+ %cvt3 = uitofp i16 %ld3 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @uitofp_8i16_8f32() #0 {
+; SSE-LABEL: @uitofp_8i16_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_8i16_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4), align 8
+ %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6), align 4
+ %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7), align 2
+ %cvt0 = uitofp i16 %ld0 to float
+ %cvt1 = uitofp i16 %ld1 to float
+ %cvt2 = uitofp i16 %ld2 to float
+ %cvt3 = uitofp i16 %ld3 to float
+ %cvt4 = uitofp i16 %ld4 to float
+ %cvt5 = uitofp i16 %ld5 to float
+ %cvt6 = uitofp i16 %ld6 to float
+ %cvt7 = uitofp i16 %ld7 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+ store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+ store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @uitofp_16i16_16f32() #0 {
+; SSE-LABEL: @uitofp_16i16_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8
+; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float>
+; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float>
+; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @uitofp_16i16_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16
+; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float>
+; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x float>
+; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_16i16_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @src16 to <16 x i16>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 0 ), align 64
+ %ld1 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 1 ), align 2
+ %ld2 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2 ), align 4
+ %ld3 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 3 ), align 2
+ %ld4 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4 ), align 8
+ %ld5 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 5 ), align 2
+ %ld6 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6 ), align 4
+ %ld7 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 7 ), align 2
+ %ld8 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8 ), align 16
+ %ld9 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 9 ), align 2
+ %ld10 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 10), align 4
+ %ld11 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 11), align 2
+ %ld12 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12), align 8
+ %ld13 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 13), align 2
+ %ld14 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 14), align 4
+ %ld15 = load i16, i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 15), align 2
+ %cvt0 = uitofp i16 %ld0 to float
+ %cvt1 = uitofp i16 %ld1 to float
+ %cvt2 = uitofp i16 %ld2 to float
+ %cvt3 = uitofp i16 %ld3 to float
+ %cvt4 = uitofp i16 %ld4 to float
+ %cvt5 = uitofp i16 %ld5 to float
+ %cvt6 = uitofp i16 %ld6 to float
+ %cvt7 = uitofp i16 %ld7 to float
+ %cvt8 = uitofp i16 %ld8 to float
+ %cvt9 = uitofp i16 %ld9 to float
+ %cvt10 = uitofp i16 %ld10 to float
+ %cvt11 = uitofp i16 %ld11 to float
+ %cvt12 = uitofp i16 %ld12 to float
+ %cvt13 = uitofp i16 %ld13 to float
+ %cvt14 = uitofp i16 %ld14 to float
+ %cvt15 = uitofp i16 %ld15 to float
+ store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+ store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+ store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+ store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+ store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+ store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+ store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+ store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+ store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
+define void @uitofp_4i8_4f32() #0 {
+; CHECK-LABEL: @uitofp_4i8_4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; CHECK-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; CHECK-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+ %cvt0 = uitofp i8 %ld0 to float
+ %cvt1 = uitofp i8 %ld1 to float
+ %cvt2 = uitofp i8 %ld2 to float
+ %cvt3 = uitofp i8 %ld3 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ ret void
+}
+
+define void @uitofp_8i8_8f32() #0 {
+; SSE-LABEL: @uitofp_8i8_8f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX-LABEL: @uitofp_8i8_8f32(
+; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4), align 4
+ %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6), align 2
+ %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7), align 1
+ %cvt0 = uitofp i8 %ld0 to float
+ %cvt1 = uitofp i8 %ld1 to float
+ %cvt2 = uitofp i8 %ld2 to float
+ %cvt3 = uitofp i8 %ld3 to float
+ %cvt4 = uitofp i8 %ld4 to float
+ %cvt5 = uitofp i8 %ld5 to float
+ %cvt6 = uitofp i8 %ld6 to float
+ %cvt7 = uitofp i8 %ld7 to float
+ store float %cvt0, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64
+ store float %cvt1, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4
+ store float %cvt2, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2), align 8
+ store float %cvt3, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3), align 4
+ store float %cvt4, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4), align 16
+ store float %cvt5, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5), align 4
+ store float %cvt6, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6), align 8
+ store float %cvt7, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7), align 4
+ ret void
+}
+
+define void @uitofp_16i8_16f32() #0 {
+; SSE-LABEL: @uitofp_16i8_16f32(
+; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4
+; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8
+; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4
+; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float>
+; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float>
+; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float>
+; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float>
+; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64
+; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16
+; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32
+; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16
+; SSE-NEXT: ret void
+;
+; AVX256-LABEL: @uitofp_16i8_16f32(
+; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64
+; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8
+; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float>
+; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i8> [[TMP2]] to <8 x float>
+; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64
+; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32
+; AVX256-NEXT: ret void
+;
+; AVX512-LABEL: @uitofp_16i8_16f32(
+; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @src8 to <16 x i8>*), align 64
+; AVX512-NEXT: [[TMP2:%.*]] = uitofp <16 x i8> [[TMP1]] to <16 x float>
+; AVX512-NEXT: store <16 x float> [[TMP2]], <16 x float>* bitcast ([16 x float]* @dst32 to <16 x float>*), align 64
+; AVX512-NEXT: ret void
+;
+ %ld0 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 0 ), align 64
+ %ld1 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 1 ), align 1
+ %ld2 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2 ), align 2
+ %ld3 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 3 ), align 1
+ %ld4 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4 ), align 4
+ %ld5 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 5 ), align 1
+ %ld6 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6 ), align 2
+ %ld7 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 7 ), align 1
+ %ld8 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8 ), align 8
+ %ld9 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 9 ), align 1
+ %ld10 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 10), align 2
+ %ld11 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 11), align 1
+ %ld12 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12), align 4
+ %ld13 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 13), align 1
+ %ld14 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 14), align 2
+ %ld15 = load i8, i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 15), align 1
+ %cvt0 = uitofp i8 %ld0 to float
+ %cvt1 = uitofp i8 %ld1 to float
+ %cvt2 = uitofp i8 %ld2 to float
+ %cvt3 = uitofp i8 %ld3 to float
+ %cvt4 = uitofp i8 %ld4 to float
+ %cvt5 = uitofp i8 %ld5 to float
+ %cvt6 = uitofp i8 %ld6 to float
+ %cvt7 = uitofp i8 %ld7 to float
+ %cvt8 = uitofp i8 %ld8 to float
+ %cvt9 = uitofp i8 %ld9 to float
+ %cvt10 = uitofp i8 %ld10 to float
+ %cvt11 = uitofp i8 %ld11 to float
+ %cvt12 = uitofp i8 %ld12 to float
+ %cvt13 = uitofp i8 %ld13 to float
+ %cvt14 = uitofp i8 %ld14 to float
+ %cvt15 = uitofp i8 %ld15 to float
+ store float %cvt0 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0 ), align 64
+ store float %cvt1 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1 ), align 4
+ store float %cvt2 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 2 ), align 8
+ store float %cvt3 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 3 ), align 4
+ store float %cvt4 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4 ), align 16
+ store float %cvt5 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 5 ), align 4
+ store float %cvt6 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 6 ), align 8
+ store float %cvt7 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 7 ), align 4
+ store float %cvt8 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8 ), align 32
+ store float %cvt9 , float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 9 ), align 4
+ store float %cvt10, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 10), align 8
+ store float %cvt11, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 11), align 4
+ store float %cvt12, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12), align 16
+ store float %cvt13, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 13), align 4
+ store float %cvt14, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 14), align 8
+ store float %cvt15, float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 15), align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
new file mode 100644
index 00000000000..7ecd5805f93
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/undef_vect.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+
+%"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76" = type { i32, i32 }
+
+define void @_Z2azv() local_unnamed_addr {
+; CHECK-LABEL: @_Z2azv(
+; CHECK-NEXT: for.body.lr.ph:
+; CHECK-NEXT: [[DOTSROA_CAST_4:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 4, i32 0
+; CHECK-NEXT: [[DOTSROA_RAW_IDX_4:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 4, i32 1
+; CHECK-NEXT: [[DOTSROA_CAST_5:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 5, i32 0
+; CHECK-NEXT: [[DOTSROA_RAW_IDX_5:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 5, i32 1
+; CHECK-NEXT: [[DOTSROA_CAST_6:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 6, i32 0
+; CHECK-NEXT: [[DOTSROA_RAW_IDX_6:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 6, i32 1
+; CHECK-NEXT: [[DOTSROA_CAST_7:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 0
+; CHECK-NEXT: [[DOTSROA_RAW_IDX_7:%.*]] = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[DOTSROA_CAST_4]] to <8 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[CMP_I1_4:%.*]] = icmp slt i32 undef, undef
+; CHECK-NEXT: [[DOTSROA_SPECULATED_4:%.*]] = select i1 [[CMP_I1_4]], i32 undef, i32 undef
+; CHECK-NEXT: [[CMP_I1_5:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_4]], undef
+; CHECK-NEXT: [[DOTSROA_SPECULATED_5:%.*]] = select i1 [[CMP_I1_5]], i32 undef, i32 [[DOTSROA_SPECULATED_4]]
+; CHECK-NEXT: [[CMP_I1_6:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_5]], undef
+; CHECK-NEXT: [[DOTSROA_SPECULATED_6:%.*]] = select i1 [[CMP_I1_6]], i32 undef, i32 [[DOTSROA_SPECULATED_5]]
+; CHECK-NEXT: [[CMP_I1_7:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_6]], undef
+; CHECK-NEXT: [[DOTSROA_SPECULATED_7:%.*]] = select i1 [[CMP_I1_7]], i32 undef, i32 [[DOTSROA_SPECULATED_6]]
+; CHECK-NEXT: [[CMP_I1_8:%.*]] = icmp slt i32 undef, undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <8 x i32> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP1]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], undef
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 undef
+; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[OP_EXTRA]], undef
+; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[TMP4]], i32 [[OP_EXTRA]], i32 undef
+; CHECK-NEXT: [[DOTSROA_SPECULATED_8:%.*]] = select i1 [[CMP_I1_8]], i32 undef, i32 undef
+; CHECK-NEXT: [[DOTSROA_SPECULATED_9:%.*]] = select i1 undef, i32 undef, i32 [[OP_EXTRA7]]
+; CHECK-NEXT: [[CMP_I1_10:%.*]] = icmp slt i32 [[DOTSROA_SPECULATED_9]], undef
+; CHECK-NEXT: ret void
+;
+for.body.lr.ph:
+ %.sroa_cast.4 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 4, i32 0
+ %retval.sroa.0.0.copyload.i5.4 = load i32, i32* %.sroa_cast.4, align 4
+ %.sroa_raw_idx.4 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 4, i32 1
+ %retval.sroa.0.0.copyload.i7.4 = load i32, i32* %.sroa_raw_idx.4, align 4
+ %cmp.i2.4 = icmp slt i32 %retval.sroa.0.0.copyload.i5.4, %retval.sroa.0.0.copyload.i7.4
+ %0 = select i1 %cmp.i2.4, i32 %retval.sroa.0.0.copyload.i7.4, i32 %retval.sroa.0.0.copyload.i5.4
+ %cmp.i1.4 = icmp slt i32 undef, %0
+ %.sroa.speculated.4 = select i1 %cmp.i1.4, i32 %0, i32 undef
+ %.sroa_cast.5 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 5, i32 0
+ %retval.sroa.0.0.copyload.i5.5 = load i32, i32* %.sroa_cast.5, align 4
+ %.sroa_raw_idx.5 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 5, i32 1
+ %retval.sroa.0.0.copyload.i7.5 = load i32, i32* %.sroa_raw_idx.5, align 4
+ %cmp.i2.5 = icmp slt i32 %retval.sroa.0.0.copyload.i5.5, %retval.sroa.0.0.copyload.i7.5
+ %1 = select i1 %cmp.i2.5, i32 %retval.sroa.0.0.copyload.i7.5, i32 %retval.sroa.0.0.copyload.i5.5
+ %cmp.i1.5 = icmp slt i32 %.sroa.speculated.4, %1
+ %.sroa.speculated.5 = select i1 %cmp.i1.5, i32 %1, i32 %.sroa.speculated.4
+ %.sroa_cast.6 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 6, i32 0
+ %retval.sroa.0.0.copyload.i5.6 = load i32, i32* %.sroa_cast.6, align 4
+ %.sroa_raw_idx.6 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 6, i32 1
+ %retval.sroa.0.0.copyload.i7.6 = load i32, i32* %.sroa_raw_idx.6, align 4
+ %cmp.i2.6 = icmp slt i32 %retval.sroa.0.0.copyload.i5.6, %retval.sroa.0.0.copyload.i7.6
+ %2 = select i1 %cmp.i2.6, i32 %retval.sroa.0.0.copyload.i7.6, i32 %retval.sroa.0.0.copyload.i5.6
+ %cmp.i1.6 = icmp slt i32 %.sroa.speculated.5, %2
+ %.sroa.speculated.6 = select i1 %cmp.i1.6, i32 %2, i32 %.sroa.speculated.5
+ %.sroa_cast.7 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 0
+ %retval.sroa.0.0.copyload.i5.7 = load i32, i32* %.sroa_cast.7, align 4
+ %.sroa_raw_idx.7 = getelementptr inbounds %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76", %"struct.std::h.0.4.8.12.16.20.24.28.248.0.1.2.3.76"* undef, i64 7, i32 1
+ %retval.sroa.0.0.copyload.i7.7 = load i32, i32* %.sroa_raw_idx.7, align 4
+ %cmp.i2.7 = icmp slt i32 %retval.sroa.0.0.copyload.i5.7, %retval.sroa.0.0.copyload.i7.7
+ %3 = select i1 %cmp.i2.7, i32 %retval.sroa.0.0.copyload.i7.7, i32 %retval.sroa.0.0.copyload.i5.7
+ %cmp.i1.7 = icmp slt i32 %.sroa.speculated.6, %3
+ %.sroa.speculated.7 = select i1 %cmp.i1.7, i32 %3, i32 %.sroa.speculated.6
+ %cmp.i1.8 = icmp slt i32 %.sroa.speculated.7, undef
+ %.sroa.speculated.8 = select i1 %cmp.i1.8, i32 undef, i32 %.sroa.speculated.7
+ %.sroa.speculated.9 = select i1 undef, i32 undef, i32 %.sroa.speculated.8
+ %cmp.i1.10 = icmp slt i32 %.sroa.speculated.9, undef
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll b/llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll
new file mode 100644
index 00000000000..cc1f1fca8ca
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/unreachable.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s
+
+; Check that the SLPVectorizer does not crash when handling
+; unreachable blocks with unschedulable instructions.
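+;
+; A minimal sketch of the hazard (illustrative only, not part of this test):
+; in a block with no predecessors, an instruction may use its own result,
+;   %x = fadd float %x, 0.000000e+00
+; and the scheduler has no valid ordering for such a cycle, so the
+; vectorizer must bail out of the block rather than assert.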
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+define void @foo(i32* nocapture %x) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[BB2:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[T3:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 4
+; CHECK-NEXT: [[T4:%.*]] = load i32, i32* [[T3]], align 4
+; CHECK-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 5
+; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[T5]], align 4
+; CHECK-NEXT: [[BAD:%.*]] = fadd float [[BAD]], 0.000000e+00
+; CHECK-NEXT: [[T7:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 6
+; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[T7]], align 4
+; CHECK-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 7
+; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[T9]], align 4
+; CHECK-NEXT: br label [[BB2]]
+; CHECK: bb2:
+; CHECK-NEXT: [[T1_0:%.*]] = phi i32 [ [[T4]], [[BB1:%.*]] ], [ 2, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[T2_0:%.*]] = phi i32 [ [[T6]], [[BB1]] ], [ 2, [[ENTRY]] ]
+; CHECK-NEXT: [[T3_0:%.*]] = phi i32 [ [[T8]], [[BB1]] ], [ 2, [[ENTRY]] ]
+; CHECK-NEXT: [[T4_0:%.*]] = phi i32 [ [[T10]], [[BB1]] ], [ 2, [[ENTRY]] ]
+; CHECK-NEXT: store i32 [[T1_0]], i32* [[X]], align 4
+; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 1
+; CHECK-NEXT: store i32 [[T2_0]], i32* [[T12]], align 4
+; CHECK-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 2
+; CHECK-NEXT: store i32 [[T3_0]], i32* [[T13]], align 4
+; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
+; CHECK-NEXT: store i32 [[T4_0]], i32* [[T14]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %bb2
+
+bb1: ; an unreachable block
+ %t3 = getelementptr inbounds i32, i32* %x, i64 4
+ %t4 = load i32, i32* %t3, align 4
+ %t5 = getelementptr inbounds i32, i32* %x, i64 5
+ %t6 = load i32, i32* %t5, align 4
+  %bad = fadd float %bad, 0.000000e+00 ; <- an instruction with a self-dependency,
+                                       ; but legal in unreachable code
+ %t7 = getelementptr inbounds i32, i32* %x, i64 6
+ %t8 = load i32, i32* %t7, align 4
+ %t9 = getelementptr inbounds i32, i32* %x, i64 7
+ %t10 = load i32, i32* %t9, align 4
+ br label %bb2
+
+bb2:
+ %t1.0 = phi i32 [ %t4, %bb1 ], [ 2, %entry ]
+ %t2.0 = phi i32 [ %t6, %bb1 ], [ 2, %entry ]
+ %t3.0 = phi i32 [ %t8, %bb1 ], [ 2, %entry ]
+ %t4.0 = phi i32 [ %t10, %bb1 ], [ 2, %entry ]
+ store i32 %t1.0, i32* %x, align 4
+ %t12 = getelementptr inbounds i32, i32* %x, i64 1
+ store i32 %t2.0, i32* %t12, align 4
+ %t13 = getelementptr inbounds i32, i32* %x, i64 2
+ store i32 %t3.0, i32* %t13, align 4
+ %t14 = getelementptr inbounds i32, i32* %x, i64 3
+ store i32 %t4.0, i32* %t14, align 4
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
new file mode 100644
index 00000000000..78df5a1d68d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/value-bug.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -mtriple="x86_64-grtev3-linux-gnu" -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; We used to crash on this example because we were building a constant
+; expression during vectorization and the vectorizer expects instructions
+; as elements of the vectorized tree.
+; PR19621
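+;
+; A hedged sketch of the failure mode (inferred from the comment above, not
+; quoted from PR19621): when every lane of a tree entry folds away, e.g.
+; fpext/fsub chains over undef, the "vectorized" value can be a Constant
+; rather than an Instruction, and tree-handling code that unconditionally
+; casts nodes to Instruction would crash.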
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: bb279:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> undef, float undef, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float undef, i32 1
+; CHECK-NEXT: br label [[BB283:%.*]]
+; CHECK: bb283:
+; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP13:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP1]], [[EXIT]] ]
+; CHECK-NEXT: br label [[BB284:%.*]]
+; CHECK: bb284:
+; CHECK-NEXT: [[TMP4:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
+; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP5]], undef
+; CHECK-NEXT: br label [[BB21_I:%.*]]
+; CHECK: bb21.i:
+; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
+; CHECK: bb22.i:
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> undef, [[TMP6]]
+; CHECK-NEXT: br label [[BB32_I:%.*]]
+; CHECK: bb32.i:
+; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x double> [ [[TMP7]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
+; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]]
+; CHECK: exit:
+; CHECK-NEXT: [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double undef, double 0.000000e+00>
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef
+; CHECK-NEXT: [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
+; CHECK-NEXT: br label [[BB283]]
+;
+bb279:
+ br label %bb283
+
+bb283:
+ %Av.sroa.8.0 = phi float [ undef, %bb279 ], [ %tmp315, %exit ]
+ %Av.sroa.5.0 = phi float [ undef, %bb279 ], [ %tmp319, %exit ]
+ %Av.sroa.3.0 = phi float [ undef, %bb279 ], [ %tmp307, %exit ]
+ %Av.sroa.0.0 = phi float [ undef, %bb279 ], [ %tmp317, %exit ]
+ br label %bb284
+
+bb284:
+ %tmp7.i = fpext float %Av.sroa.3.0 to double
+ %tmp8.i = fsub double %tmp7.i, undef
+ %tmp9.i = fsub double %tmp8.i, undef
+ %tmp17.i = fpext float %Av.sroa.8.0 to double
+ %tmp19.i = fsub double %tmp17.i, undef
+ %tmp20.i = fsub double %tmp19.i, undef
+ br label %bb21.i
+
+bb21.i:
+ br i1 undef, label %bb22.i, label %exit
+
+bb22.i:
+ %tmp24.i = fadd double undef, %tmp9.i
+ %tmp26.i = fadd double undef, %tmp20.i
+ br label %bb32.i
+
+bb32.i:
+ %xs.0.i = phi double [ %tmp24.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
+ %ys.0.i = phi double [ %tmp26.i, %bb22.i ], [ 0.000000e+00, %bb32.i ]
+ br i1 undef, label %bb32.i, label %bb21.i
+
+exit:
+ %tmp303 = fpext float %Av.sroa.0.0 to double
+ %tmp304 = fmul double %tmp303, undef
+ %tmp305 = fadd double undef, %tmp304
+ %tmp306 = fadd double %tmp305, undef
+ %tmp307 = fptrunc double %tmp306 to float
+ %tmp311 = fpext float %Av.sroa.5.0 to double
+ %tmp312 = fmul double %tmp311, 0.000000e+00
+ %tmp313 = fadd double undef, %tmp312
+ %tmp314 = fadd double %tmp313, undef
+ %tmp315 = fptrunc double %tmp314 to float
+ %tmp317 = fptrunc double undef to float
+ %tmp319 = fptrunc double undef to float
+ br label %bb283
+}
+
+; Make sure that we properly handle constant-folded vectorized trees. The
+; vectorizer starts at the pair (%t2, %t3) and will constant-fold the tree.
+; The code that handles insertelement instructions must handle this.
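+;
+; Worked through by hand: %t0 = 1.0 + 0.0 = 1.0 and %t1 = 1.0 + 1.0 = 2.0,
+; so %t2 = 1.0 * 1.0 = 1.0 lands in lane 1 and %t3 = 2.0 * 1.0 = 2.0 in
+; lane 0, matching the two constant insertelements in the CHECK lines below.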
+define <4 x double> @constant_folding() {
+; CHECK-LABEL: @constant_folding(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double 1.000000e+00, i32 1
+; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double 2.000000e+00, i32 0
+; CHECK-NEXT: ret <4 x double> [[I2]]
+;
+entry:
+ %t0 = fadd double 1.000000e+00 , 0.000000e+00
+ %t1 = fadd double 1.000000e+00 , 1.000000e+00
+ %t2 = fmul double %t0, 1.000000e+00
+ %i1 = insertelement <4 x double> undef, double %t2, i32 1
+ %t3 = fmul double %t1, 1.000000e+00
+ %i2 = insertelement <4 x double> %i1, double %t3, i32 0
+ ret <4 x double> %i2
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
new file mode 100644
index 00000000000..8bf3f362f03
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -0,0 +1,984 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
+define void @add0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @add0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 1, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %add = add nsw i32 %0, 1
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %add, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %add3 = add nsw i32 %1, 1
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %add3, i32* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %add6 = add nsw i32 %2, 2
+ %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %add6, i32* %incdec.ptr4, align 4
+ %3 = load i32, i32* %incdec.ptr5, align 4
+ %add9 = add nsw i32 %3, 3
+ store i32 %add9, i32* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @add1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @add1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
+; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %0, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %add3 = add nsw i32 %1, 1
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %add3, i32* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %add6 = add nsw i32 %2, 2
+ %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %add6, i32* %incdec.ptr4, align 4
+ %3 = load i32, i32* %incdec.ptr5, align 4
+ %add9 = add nsw i32 %3, 3
+ store i32 %add9, i32* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @sub0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %sub = add nsw i32 %0, -1
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %sub, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %1, i32* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %sub5 = add nsw i32 %2, -2
+ %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %sub5, i32* %incdec.ptr3, align 4
+ %3 = load i32, i32* %incdec.ptr4, align 4
+ %sub8 = add nsw i32 %3, -3
+ store i32 %sub8, i32* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @sub1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 4, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %add = add nsw i32 %0, 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %add, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %sub = add nsw i32 %1, -1
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %sub, i32* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %sub5 = add nsw i32 %2, -2
+ %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %sub5, i32* %incdec.ptr3, align 4
+ %3 = load i32, i32* %incdec.ptr4, align 4
+ %sub8 = add nsw i32 %3, -3
+ store i32 %sub8, i32* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @sub2(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @sub2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %sub = add nsw i32 %0, -1
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %sub, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %sub3 = add nsw i32 %1, -1
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %sub3, i32* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %sub6 = add nsw i32 %2, -2
+ %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %sub6, i32* %incdec.ptr4, align 4
+ %3 = load i32, i32* %incdec.ptr5, align 4
+ %sub9 = add nsw i32 %3, -3
+ store i32 %sub9, i32* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @addsub0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @addsub0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %sub = add nsw i32 %0, -1
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %sub, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %1, i32* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %sub5 = add nsw i32 %2, -2
+ %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %sub5, i32* %incdec.ptr3, align 4
+ %3 = load i32, i32* %incdec.ptr4, align 4
+ %sub8 = sub nsw i32 %3, -3
+ store i32 %sub8, i32* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @addsub1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @addsub1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
+; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %sub = add nsw i32 %0, -1
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %sub, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %sub1 = sub nsw i32 %1, -1
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %sub1, i32* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %2, i32* %incdec.ptr3, align 4
+ %3 = load i32, i32* %incdec.ptr4, align 4
+ %sub8 = sub nsw i32 %3, -3
+ store i32 %sub8, i32* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @mul(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @mul(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
+; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %mul = mul nsw i32 %0, 257
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %mul, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %mul3 = mul nsw i32 %1, -3
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %mul3, i32* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %2, i32* %incdec.ptr4, align 4
+ %3 = load i32, i32* %incdec.ptr5, align 4
+ %mul9 = mul nsw i32 %3, -9
+ store i32 %mul9, i32* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @shl0(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @shl0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
+; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %0, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %shl = shl i32 %1, 1
+ %incdec.ptr3 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %shl, i32* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %shl5 = shl i32 %2, 2
+ %incdec.ptr6 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %shl5, i32* %incdec.ptr3, align 4
+ %3 = load i32, i32* %incdec.ptr4, align 4
+ %shl8 = shl i32 %3, 3
+ store i32 %shl8, i32* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @shl1(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @shl1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 7, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds i32, i32* %src, i64 1
+ %0 = load i32, i32* %src, align 4
+ %shl = shl i32 %0, 7
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %dst, i64 1
+ store i32 %shl, i32* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds i32, i32* %src, i64 2
+ %1 = load i32, i32* %incdec.ptr, align 4
+ %shl3 = shl i32 %1, 1
+ %incdec.ptr4 = getelementptr inbounds i32, i32* %dst, i64 2
+ store i32 %shl3, i32* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds i32, i32* %src, i64 3
+ %2 = load i32, i32* %incdec.ptr2, align 4
+ %shl6 = shl i32 %2, 2
+ %incdec.ptr7 = getelementptr inbounds i32, i32* %dst, i64 3
+ store i32 %shl6, i32* %incdec.ptr4, align 4
+ %3 = load i32, i32* %incdec.ptr5, align 4
+ %shl9 = shl i32 %3, 3
+ store i32 %shl9, i32* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @add0f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add0f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd fast float %0, 1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %add3 = fadd fast float %1, 1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %add3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd fast float %2, 2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd fast float %3, 3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @add1f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add1f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %0, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %add3 = fadd fast float %1, 1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %add3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd fast float %2, 2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd fast float %3, 3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @sub0f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub0f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd fast float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %1, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd fast float %2, -2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd fast float %3, -3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
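+; Subtraction is expressed as fadd with negated constants; since every lane
+; is still an fadd, the CHECK lines show a single vectorized fadd.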
+define void @sub1f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub1f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd fast float %0, 4.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub = fadd fast float %1, -1.000000e+00
+ %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub, float* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %sub5 = fadd fast float %2, -2.000000e+00
+ %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+ store float %sub5, float* %incdec.ptr3, align 4
+ %3 = load float, float* %incdec.ptr4, align 4
+ %sub8 = fadd fast float %3, -3.000000e+00
+ store float %sub8, float* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @sub2f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub2f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fadd fast float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub3 = fadd fast float %1, -1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %sub6 = fadd fast float %2, -2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %sub6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %sub9 = fadd fast float %3, -3.000000e+00
+ store float %sub9, float* %incdec.ptr7, align 4
+ ret void
+}
+
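+; The addsub tests mix fadd and fsub lanes plus a plain copy, so the CHECK
+; lines show no vectorization.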
+define void @addsub0f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @addsub0f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fadd fast float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+ store float %1, float* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %sub5 = fadd fast float %2, -2.000000e+00
+ %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+ store float %sub5, float* %incdec.ptr3, align 4
+ %3 = load float, float* %incdec.ptr4, align 4
+ %sub8 = fsub fast float %3, -3.000000e+00
+ store float %sub8, float* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @addsub1f(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @addsub1f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fadd fast float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub1 = fsub fast float %1, -1.000000e+00
+ %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub1, float* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+ store float %2, float* %incdec.ptr3, align 4
+ %3 = load float, float* %incdec.ptr4, align 4
+ %sub8 = fsub fast float %3, -3.000000e+00
+ store float %sub8, float* %incdec.ptr6, align 4
+ ret void
+}
+
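+; fmul lanes with a plain copy in lane 2: the CHECK lines show the sequence
+; left scalar, presumably because the lanes do not share one opcode.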
+define void @mulf(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @mulf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fmul fast float %0, 2.570000e+02
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub3 = fmul fast float %1, -3.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %2, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %sub9 = fmul fast float %3, -9.000000e+00
+ store float %sub9, float* %incdec.ptr7, align 4
+ ret void
+}
+
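+; The *fn variants repeat the tests above without fast-math flags; a uniform
+; plain fadd still vectorizes, as the <4 x float> fadd in the CHECK lines
+; below shows.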
+define void @add0fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add0fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd float %0, 1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %add3 = fadd float %1, 1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %add3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd float %2, 2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd float %3, 3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @add1fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @add1fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %0, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %add3 = fadd float %1, 1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %add3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd float %2, 2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd float %3, 3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
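+; Mixed fast and non-fast fadd lanes plus a copy in lane 1; the CHECK lines
+; show this stays scalar.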
+define void @sub0fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub0fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
+; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd fast float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %1, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %add6 = fadd float %2, -2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %add6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %add9 = fadd float %3, -3.000000e+00
+ store float %add9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @sub1fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub1fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %add = fadd float %0, 4.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %add, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub = fadd float %1, -1.000000e+00
+ %incdec.ptr3 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub, float* %incdec.ptr1, align 4
+ %incdec.ptr4 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %sub5 = fadd float %2, -2.000000e+00
+ %incdec.ptr6 = getelementptr inbounds float, float* %dst, i64 3
+ store float %sub5, float* %incdec.ptr3, align 4
+ %3 = load float, float* %incdec.ptr4, align 4
+ %sub8 = fadd float %3, -3.000000e+00
+ store float %sub8, float* %incdec.ptr6, align 4
+ ret void
+}
+
+define void @sub2fn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @sub2fn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fadd float %0, -1.000000e+00
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub3 = fadd float %1, -1.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %sub6 = fadd float %2, -2.000000e+00
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %sub6, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %sub9 = fadd float %3, -3.000000e+00
+ store float %sub9, float* %incdec.ptr7, align 4
+ ret void
+}
+
+define void @mulfn(float* noalias %dst, float* noalias %src) {
+; CHECK-LABEL: @mulfn(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02
+; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
+; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
+; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
+; CHECK-NEXT: [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00
+; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
+; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
+; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
+; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
+; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
+; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
+; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %incdec.ptr = getelementptr inbounds float, float* %src, i64 1
+ %0 = load float, float* %src, align 4
+ %sub = fmul float %0, 2.570000e+02
+ %incdec.ptr1 = getelementptr inbounds float, float* %dst, i64 1
+ store float %sub, float* %dst, align 4
+ %incdec.ptr2 = getelementptr inbounds float, float* %src, i64 2
+ %1 = load float, float* %incdec.ptr, align 4
+ %sub3 = fmul float %1, -3.000000e+00
+ %incdec.ptr4 = getelementptr inbounds float, float* %dst, i64 2
+ store float %sub3, float* %incdec.ptr1, align 4
+ %incdec.ptr5 = getelementptr inbounds float, float* %src, i64 3
+ %2 = load float, float* %incdec.ptr2, align 4
+ %incdec.ptr7 = getelementptr inbounds float, float* %dst, i64 3
+ store float %2, float* %incdec.ptr4, align 4
+ %3 = load float, float* %incdec.ptr5, align 4
+ %sub9 = fmul fast float %3, -9.000000e+00
+ store float %sub9, float* %incdec.ptr7, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/vector.ll
new file mode 100644
index 00000000000..e1f3fa50ccd
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vector.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Make sure that we neither crash nor change the code.
+define void @test(<4 x i32> %in, <4 x i32> %in2) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: [[K:%.*]] = icmp eq <4 x i32> [[IN:%.*]], [[IN2:%.*]]
+; CHECK-NEXT: ret void
+;
+ %k = icmp eq <4 x i32> %in, %in2
+ ret void
+}
+
+define i1 @cmpv2f32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @cmpv2f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[X0:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT: [[Y0:%.*]] = extractelement <2 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[X0]], [[Y0]]
+; CHECK-NEXT: br i1 [[CMP0]], label [[IF:%.*]], label [[ENDIF:%.*]]
+; CHECK: if:
+; CHECK-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X]], i32 1
+; CHECK-NEXT: [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i32 1
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[X1]], [[Y1]]
+; CHECK-NEXT: br label [[ENDIF]]
+; CHECK: endif:
+; CHECK-NEXT: [[AND_OF_CMPS:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP1]], [[IF]] ]
+; CHECK-NEXT: ret i1 [[AND_OF_CMPS]]
+;
+entry:
+ %x0 = extractelement <2 x i32> %x, i32 0
+ %y0 = extractelement <2 x i32> %y, i32 0
+ %cmp0 = icmp eq i32 %x0, %y0
+ br i1 %cmp0, label %if, label %endif
+
+if:
+ %x1 = extractelement <2 x i32> %x, i32 1
+ %y1 = extractelement <2 x i32> %y, i32 1
+ %cmp1 = icmp eq i32 %x1, %y1
+ br label %endif
+
+endif:
+ %and_of_cmps = phi i1 [ false, %entry ], [ %cmp1, %if ]
+ ret i1 %and_of_cmps
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
new file mode 100644
index 00000000000..436f091d724
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the SLP vectorizer does not fail on a vector GEP.
+; The GEP has scalar and vector operands and returns a vector of pointers.
+
+; Function Attrs: noreturn readonly uwtable
+define void @_Z3fn1v(i32 %x, <16 x i32*>%y) local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3fn1v(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV42_LE:%.*]] = sext i32 [[X:%.*]] to i64
+; CHECK-NEXT: [[CONV36109_LE:%.*]] = zext i32 2 to i64
+; CHECK-NEXT: [[VECTORGEP:%.*]] = getelementptr i32, <16 x i32*> [[Y:%.*]], i64 [[CONV36109_LE]]
+; CHECK-NEXT: [[VECTORGEP208:%.*]] = getelementptr i32, <16 x i32*> [[Y]], i64 [[CONV42_LE]]
+; CHECK-NEXT: unreachable
+;
+
+entry:
+ %conv42.le = sext i32 %x to i64
+ %conv36109.le = zext i32 2 to i64
+ %VectorGep = getelementptr i32, <16 x i32*> %y, i64 %conv36109.le
+ %VectorGep208 = getelementptr i32, <16 x i32*> %y, i64 %conv42.le
+ unreachable
+}
+
+attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
new file mode 100644
index 00000000000..889bba80b7a
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s
+
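+; The two loaded values are reused across all eight adds: the CHECK lines
+; show one <2 x i32> load, a reorder shuffle, a widening reuse shuffle to
+; <8 x i32>, a single vector add, and a shuffle-based unsigned-min reduction.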
+define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
+; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 undef, undef
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
+; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
+; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 undef
+; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], undef
+; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 undef
+; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], undef
+; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 undef
+; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], undef
+; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 undef
+; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], undef
+; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 undef
+; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 undef
+; CHECK-NEXT: ret i32 [[TMP11]]
+;
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %arr, i64 1
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %a1
+ %add2 = add i32 %0, %a2
+ %add4 = add i32 %0, %a3
+ %add6 = add i32 %0, %a4
+ %add8 = add i32 %0, %a5
+ %add10 = add i32 %0, %a6
+ %1 = load i32, i32* %arr, align 4
+ %add12 = add i32 %1, %a7
+ %add14 = add i32 %1, %a8
+ %cmp = icmp ult i32 %add, %add2
+ %cond = select i1 %cmp, i32 %add, i32 %add2
+ %cmp15 = icmp ult i32 %cond, %add4
+ %cond19 = select i1 %cmp15, i32 %cond, i32 %add4
+ %cmp20 = icmp ult i32 %cond19, %add6
+ %cond24 = select i1 %cmp20, i32 %cond19, i32 %add6
+ %cmp25 = icmp ult i32 %cond24, %add8
+ %cond29 = select i1 %cmp25, i32 %cond24, i32 %add8
+ %cmp30 = icmp ult i32 %cond29, %add10
+ %cond34 = select i1 %cmp30, i32 %cond29, i32 %add10
+ %cmp35 = icmp ult i32 %cond34, %add12
+ %cond39 = select i1 %cmp35, i32 %cond34, i32 %add12
+ %cmp40 = icmp ult i32 %cond39, %add14
+ %cond44 = select i1 %cmp40, i32 %cond39, i32 %add14
+ ret i32 %cond44
+}
+
+define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: @foo1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 2
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 0, i32 3, i32 1, i32 0>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
+; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 undef, undef
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
+; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
+; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 undef
+; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], undef
+; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 undef
+; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], undef
+; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 undef
+; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], undef
+; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 undef
+; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], undef
+; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 undef
+; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 undef
+; CHECK-NEXT: ret i32 [[TMP11]]
+;
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %arr, i64 1
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %a1
+ %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add2 = add i32 %1, %a2
+ %arrayidx3 = getelementptr inbounds i32, i32* %arr, i64 3
+ %2 = load i32, i32* %arrayidx3, align 4
+ %add4 = add i32 %2, %a3
+ %add6 = add i32 %0, %a4
+ %add8 = add i32 %0, %a5
+ %3 = load i32, i32* %arr, align 4
+ %add10 = add i32 %3, %a6
+ %add12 = add i32 %1, %a7
+ %add14 = add i32 %0, %a8
+ %cmp = icmp ult i32 %add, %add2
+ %cond = select i1 %cmp, i32 %add, i32 %add2
+ %cmp15 = icmp ult i32 %cond, %add4
+ %cond19 = select i1 %cmp15, i32 %cond, i32 %add4
+ %cmp20 = icmp ult i32 %cond19, %add6
+ %cond24 = select i1 %cmp20, i32 %cond19, i32 %add6
+ %cmp25 = icmp ult i32 %cond24, %add8
+ %cond29 = select i1 %cmp25, i32 %cond24, i32 %add8
+ %cmp30 = icmp ult i32 %cond29, %add10
+ %cond34 = select i1 %cmp30, i32 %cond29, i32 %add10
+ %cmp35 = icmp ult i32 %cond34, %add12
+ %cond39 = select i1 %cmp35, i32 %cond34, i32 %add12
+ %cmp40 = icmp ult i32 %cond39, %add14
+ %cond44 = select i1 %cmp40, i32 %cond39, i32 %add14
+ ret i32 %cond44
+}
+
+define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) {
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 3
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 2
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 2, i32 3, i32 2, i32 1, i32 3>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
+; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 undef, undef
+; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
+; CHECK-NEXT: [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
+; CHECK-NEXT: [[COND19:%.*]] = select i1 [[CMP15]], i32 [[COND]], i32 undef
+; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[COND19]], undef
+; CHECK-NEXT: [[COND24:%.*]] = select i1 [[CMP20]], i32 [[COND19]], i32 undef
+; CHECK-NEXT: [[CMP25:%.*]] = icmp ult i32 [[COND24]], undef
+; CHECK-NEXT: [[COND29:%.*]] = select i1 [[CMP25]], i32 [[COND24]], i32 undef
+; CHECK-NEXT: [[CMP30:%.*]] = icmp ult i32 [[COND29]], undef
+; CHECK-NEXT: [[COND34:%.*]] = select i1 [[CMP30]], i32 [[COND29]], i32 undef
+; CHECK-NEXT: [[CMP35:%.*]] = icmp ult i32 [[COND34]], undef
+; CHECK-NEXT: [[COND39:%.*]] = select i1 [[CMP35]], i32 [[COND34]], i32 undef
+; CHECK-NEXT: [[CMP40:%.*]] = icmp ult i32 [[COND39]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i32> [[TMP10]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i32> [[TMP10]], <8 x i32> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i32> [[RDX_MINMAX_SELECT]], <8 x i32> [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i32> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i32> [[RDX_MINMAX_SELECT3]], <8 x i32> [[RDX_SHUF4]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[RDX_MINMAX_SELECT6]], i32 0
+; CHECK-NEXT: [[COND44:%.*]] = select i1 [[CMP40]], i32 [[COND39]], i32 undef
+; CHECK-NEXT: ret i32 [[TMP11]]
+;
+entry:
+ %arrayidx = getelementptr inbounds i32, i32* %arr, i64 3
+ %0 = load i32, i32* %arrayidx, align 4
+ %add = add i32 %0, %a1
+ %arrayidx1 = getelementptr inbounds i32, i32* %arr, i64 2
+ %1 = load i32, i32* %arrayidx1, align 4
+ %add2 = add i32 %1, %a2
+ %add4 = add i32 %0, %a3
+ %2 = load i32, i32* %arr, align 4
+ %add6 = add i32 %2, %a4
+ %arrayidx7 = getelementptr inbounds i32, i32* %arr, i64 1
+ %3 = load i32, i32* %arrayidx7, align 4
+ %add8 = add i32 %3, %a5
+ %add10 = add i32 %2, %a6
+ %add12 = add i32 %1, %a7
+ %add14 = add i32 %3, %a8
+ %cmp = icmp ult i32 %add, %add2
+ %cond = select i1 %cmp, i32 %add, i32 %add2
+ %cmp15 = icmp ult i32 %cond, %add4
+ %cond19 = select i1 %cmp15, i32 %cond, i32 %add4
+ %cmp20 = icmp ult i32 %cond19, %add6
+ %cond24 = select i1 %cmp20, i32 %cond19, i32 %add6
+ %cmp25 = icmp ult i32 %cond24, %add8
+ %cond29 = select i1 %cmp25, i32 %cond24, i32 %add8
+ %cmp30 = icmp ult i32 %cond29, %add10
+ %cond34 = select i1 %cmp30, i32 %cond29, i32 %add10
+ %cmp35 = icmp ult i32 %cond34, %add12
+ %cond39 = select i1 %cmp35, i32 %cond34, i32 %add12
+ %cmp40 = icmp ult i32 %cond39, %add14
+ %cond44 = select i1 %cmp40, i32 %cond39, i32 %add14
+ ret i32 %cond44
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
new file mode 100644
index 00000000000..a05e4186134
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
@@ -0,0 +1,785 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+avx512bw -basicaa -slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
+
+;
+; vXi8
+;
+
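+; Whether (and at what width) the zero-extending loads vectorize differs per
+; target, hence the separate SSE2/SLM/AVX check prefixes.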
+define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
+; SSE2-LABEL: @loadext_2i8_to_2i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i8_to_2i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i8_to_2i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: ret <2 x i64> [[V1]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %x0 = zext i8 %i0 to i64
+ %x1 = zext i8 %i1 to i64
+ %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
+ ret <2 x i64> %v1
+}
+
+define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
+; CHECK-LABEL: @loadext_4i8_to_4i32(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[V3]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %x0 = zext i8 %i0 to i32
+ %x1 = zext i8 %i1 to i32
+ %x2 = zext i8 %i2 to i32
+ %x3 = zext i8 %i3 to i32
+ %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+ %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
+ %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
+ %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
+ ret <4 x i32> %v3
+}
+
+define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
+; SSE2-LABEL: @loadext_4i8_to_4i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE2-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
+; SSE2-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT: ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
+; AVX-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT: ret <4 x i64> [[V3]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %x0 = zext i8 %i0 to i64
+ %x1 = zext i8 %i1 to i64
+ %x2 = zext i8 %i2 to i64
+ %x3 = zext i8 %i3 to i64
+ %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
+ %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
+ %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
+ ret <4 x i64> %v3
+}
+
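+; i8 -> i16: all run configurations share one pattern here, a single
+; <8 x i8> load zero-extended to <8 x i16>.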
+define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i16(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT: ret <8 x i16> [[V7]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+ %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+ %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+ %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %i4 = load i8, i8* %p4, align 1
+ %i5 = load i8, i8* %p5, align 1
+ %i6 = load i8, i8* %p6, align 1
+ %i7 = load i8, i8* %p7, align 1
+ %x0 = zext i8 %i0 to i16
+ %x1 = zext i8 %i1 to i16
+ %x2 = zext i8 %i2 to i16
+ %x3 = zext i8 %i3 to i16
+ %x4 = zext i8 %i4 to i16
+ %x5 = zext i8 %i5 to i16
+ %x6 = zext i8 %i6 to i16
+ %x7 = zext i8 %i7 to i16
+ %v0 = insertelement <8 x i16> undef, i16 %x0, i32 0
+ %v1 = insertelement <8 x i16> %v0, i16 %x1, i32 1
+ %v2 = insertelement <8 x i16> %v1, i16 %x2, i32 2
+ %v3 = insertelement <8 x i16> %v2, i16 %x3, i32 3
+ %v4 = insertelement <8 x i16> %v3, i16 %x4, i32 4
+ %v5 = insertelement <8 x i16> %v4, i16 %x5, i32 5
+ %v6 = insertelement <8 x i16> %v5, i16 %x6, i32 6
+ %v7 = insertelement <8 x i16> %v6, i16 %x7, i32 7
+ ret <8 x i16> %v7
+}
+
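+; i8 -> i32: every target vectorizes this into a single <8 x i8> load
+; zero-extended to <8 x i32>.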
+define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
+; CHECK-LABEL: @loadext_8i8_to_8i32(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT: ret <8 x i32> [[V7]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+ %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+ %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+ %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %i4 = load i8, i8* %p4, align 1
+ %i5 = load i8, i8* %p5, align 1
+ %i6 = load i8, i8* %p6, align 1
+ %i7 = load i8, i8* %p7, align 1
+ %x0 = zext i8 %i0 to i32
+ %x1 = zext i8 %i1 to i32
+ %x2 = zext i8 %i2 to i32
+ %x3 = zext i8 %i3 to i32
+ %x4 = zext i8 %i4 to i32
+ %x5 = zext i8 %i5 to i32
+ %x6 = zext i8 %i6 to i32
+ %x7 = zext i8 %i7 to i32
+ %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+ %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
+ %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
+ %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
+ %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
+ %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
+ %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
+ %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
+ ret <8 x i32> %v7
+}
+
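+; i8 -> i16 at full register width: all targets emit one <16 x i8> load
+; zero-extended to <16 x i16>.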
+define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
+; CHECK-LABEL: @loadext_16i8_to_16i16(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; CHECK-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; CHECK-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; CHECK-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; CHECK-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; CHECK-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; CHECK-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; CHECK-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; CHECK-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; CHECK-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; CHECK-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; CHECK-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; CHECK-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; CHECK-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; CHECK-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; CHECK-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; CHECK-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; CHECK-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; CHECK-NEXT: ret <16 x i16> [[V15]]
+;
+ %p1 = getelementptr inbounds i8, i8* %p0, i64 1
+ %p2 = getelementptr inbounds i8, i8* %p0, i64 2
+ %p3 = getelementptr inbounds i8, i8* %p0, i64 3
+ %p4 = getelementptr inbounds i8, i8* %p0, i64 4
+ %p5 = getelementptr inbounds i8, i8* %p0, i64 5
+ %p6 = getelementptr inbounds i8, i8* %p0, i64 6
+ %p7 = getelementptr inbounds i8, i8* %p0, i64 7
+ %p8 = getelementptr inbounds i8, i8* %p0, i64 8
+ %p9 = getelementptr inbounds i8, i8* %p0, i64 9
+ %p10 = getelementptr inbounds i8, i8* %p0, i64 10
+ %p11 = getelementptr inbounds i8, i8* %p0, i64 11
+ %p12 = getelementptr inbounds i8, i8* %p0, i64 12
+ %p13 = getelementptr inbounds i8, i8* %p0, i64 13
+ %p14 = getelementptr inbounds i8, i8* %p0, i64 14
+ %p15 = getelementptr inbounds i8, i8* %p0, i64 15
+ %i0 = load i8, i8* %p0, align 1
+ %i1 = load i8, i8* %p1, align 1
+ %i2 = load i8, i8* %p2, align 1
+ %i3 = load i8, i8* %p3, align 1
+ %i4 = load i8, i8* %p4, align 1
+ %i5 = load i8, i8* %p5, align 1
+ %i6 = load i8, i8* %p6, align 1
+ %i7 = load i8, i8* %p7, align 1
+ %i8 = load i8, i8* %p8, align 1
+ %i9 = load i8, i8* %p9, align 1
+ %i10 = load i8, i8* %p10, align 1
+ %i11 = load i8, i8* %p11, align 1
+ %i12 = load i8, i8* %p12, align 1
+ %i13 = load i8, i8* %p13, align 1
+ %i14 = load i8, i8* %p14, align 1
+ %i15 = load i8, i8* %p15, align 1
+ %x0 = zext i8 %i0 to i16
+ %x1 = zext i8 %i1 to i16
+ %x2 = zext i8 %i2 to i16
+ %x3 = zext i8 %i3 to i16
+ %x4 = zext i8 %i4 to i16
+ %x5 = zext i8 %i5 to i16
+ %x6 = zext i8 %i6 to i16
+ %x7 = zext i8 %i7 to i16
+ %x8 = zext i8 %i8 to i16
+ %x9 = zext i8 %i9 to i16
+ %x10 = zext i8 %i10 to i16
+ %x11 = zext i8 %i11 to i16
+ %x12 = zext i8 %i12 to i16
+ %x13 = zext i8 %i13 to i16
+ %x14 = zext i8 %i14 to i16
+ %x15 = zext i8 %i15 to i16
+ %v0 = insertelement <16 x i16> undef, i16 %x0, i32 0
+ %v1 = insertelement <16 x i16> %v0, i16 %x1, i32 1
+ %v2 = insertelement <16 x i16> %v1, i16 %x2, i32 2
+ %v3 = insertelement <16 x i16> %v2, i16 %x3, i32 3
+ %v4 = insertelement <16 x i16> %v3, i16 %x4, i32 4
+ %v5 = insertelement <16 x i16> %v4, i16 %x5, i32 5
+ %v6 = insertelement <16 x i16> %v5, i16 %x6, i32 6
+ %v7 = insertelement <16 x i16> %v6, i16 %x7, i32 7
+ %v8 = insertelement <16 x i16> %v7, i16 %x8, i32 8
+ %v9 = insertelement <16 x i16> %v8, i16 %x9, i32 9
+ %v10 = insertelement <16 x i16> %v9, i16 %x10, i32 10
+ %v11 = insertelement <16 x i16> %v10, i16 %x11, i32 11
+ %v12 = insertelement <16 x i16> %v11, i16 %x12, i32 12
+ %v13 = insertelement <16 x i16> %v12, i16 %x13, i32 13
+ %v14 = insertelement <16 x i16> %v13, i16 %x14, i32 14
+ %v15 = insertelement <16 x i16> %v14, i16 %x15, i32 15
+ ret <16 x i16> %v15
+}
+
+;
+; vXi16
+;
+
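+; i16 -> i64: SSE2 keeps both loads scalar; SLM and AVX use a single
+; <2 x i16> vector load instead.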
+define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
+; SSE2-LABEL: @loadext_2i16_to_2i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i16_to_2i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i16_to_2i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: ret <2 x i64> [[V1]]
+;
+ %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+ %i0 = load i16, i16* %p0, align 1
+ %i1 = load i16, i16* %p1, align 1
+ %x0 = zext i16 %i0 to i64
+ %x1 = zext i16 %i1 to i64
+ %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
+ ret <2 x i64> %v1
+}
+
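+; i16 -> i32: all targets agree on a single <4 x i16> load zero-extended
+; to <4 x i32>.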
+define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
+; CHECK-LABEL: @loadext_4i16_to_4i32(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT: ret <4 x i32> [[V3]]
+;
+ %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+ %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+ %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+ %i0 = load i16, i16* %p0, align 1
+ %i1 = load i16, i16* %p1, align 1
+ %i2 = load i16, i16* %p2, align 1
+ %i3 = load i16, i16* %p3, align 1
+ %x0 = zext i16 %i0 to i32
+ %x1 = zext i16 %i1 to i32
+ %x2 = zext i16 %i2 to i32
+ %x3 = zext i16 %i3 to i32
+ %v0 = insertelement <4 x i32> undef, i32 %x0, i32 0
+ %v1 = insertelement <4 x i32> %v0, i32 %x1, i32 1
+ %v2 = insertelement <4 x i32> %v1, i32 %x2, i32 2
+ %v3 = insertelement <4 x i32> %v2, i32 %x3, i32 3
+ ret <4 x i32> %v3
+}
+
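+; i16 -> i64: SSE2 stays scalar, SLM vectorizes all four lanes, and AVX
+; vectorizes only the low two lanes.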
+define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
+; SSE2-LABEL: @loadext_4i16_to_4i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE2-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
+; SSE2-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT: ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
+; AVX-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX-NEXT: ret <4 x i64> [[V3]]
+;
+ %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+ %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+ %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+ %i0 = load i16, i16* %p0, align 1
+ %i1 = load i16, i16* %p1, align 1
+ %i2 = load i16, i16* %p2, align 1
+ %i3 = load i16, i16* %p3, align 1
+ %x0 = zext i16 %i0 to i64
+ %x1 = zext i16 %i1 to i64
+ %x2 = zext i16 %i2 to i64
+ %x3 = zext i16 %i3 to i64
+ %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
+ %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
+ %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
+ ret <4 x i64> %v3
+}
+
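+; i16 -> i32: one <8 x i16> load zero-extended to <8 x i32> on every target.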
+define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
+; CHECK-LABEL: @loadext_8i16_to_8i32(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT: ret <8 x i32> [[V7]]
+;
+ %p1 = getelementptr inbounds i16, i16* %p0, i64 1
+ %p2 = getelementptr inbounds i16, i16* %p0, i64 2
+ %p3 = getelementptr inbounds i16, i16* %p0, i64 3
+ %p4 = getelementptr inbounds i16, i16* %p0, i64 4
+ %p5 = getelementptr inbounds i16, i16* %p0, i64 5
+ %p6 = getelementptr inbounds i16, i16* %p0, i64 6
+ %p7 = getelementptr inbounds i16, i16* %p0, i64 7
+ %i0 = load i16, i16* %p0, align 1
+ %i1 = load i16, i16* %p1, align 1
+ %i2 = load i16, i16* %p2, align 1
+ %i3 = load i16, i16* %p3, align 1
+ %i4 = load i16, i16* %p4, align 1
+ %i5 = load i16, i16* %p5, align 1
+ %i6 = load i16, i16* %p6, align 1
+ %i7 = load i16, i16* %p7, align 1
+ %x0 = zext i16 %i0 to i32
+ %x1 = zext i16 %i1 to i32
+ %x2 = zext i16 %i2 to i32
+ %x3 = zext i16 %i3 to i32
+ %x4 = zext i16 %i4 to i32
+ %x5 = zext i16 %i5 to i32
+ %x6 = zext i16 %i6 to i32
+ %x7 = zext i16 %i7 to i32
+ %v0 = insertelement <8 x i32> undef, i32 %x0, i32 0
+ %v1 = insertelement <8 x i32> %v0, i32 %x1, i32 1
+ %v2 = insertelement <8 x i32> %v1, i32 %x2, i32 2
+ %v3 = insertelement <8 x i32> %v2, i32 %x3, i32 3
+ %v4 = insertelement <8 x i32> %v3, i32 %x4, i32 4
+ %v5 = insertelement <8 x i32> %v4, i32 %x5, i32 5
+ %v6 = insertelement <8 x i32> %v5, i32 %x6, i32 6
+ %v7 = insertelement <8 x i32> %v6, i32 %x7, i32 7
+ ret <8 x i32> %v7
+}
+
+;
+; vXi32
+;
+
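+; i32 -> i64: SSE2 keeps the two loads scalar; SLM and AVX fold them into
+; a single <2 x i32> vector load.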
+define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
+; SSE2-LABEL: @loadext_2i32_to_2i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: ret <2 x i64> [[V1]]
+;
+; SLM-LABEL: @loadext_2i32_to_2i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: ret <2 x i64> [[V1]]
+;
+; AVX-LABEL: @loadext_2i32_to_2i64(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX-NEXT: ret <2 x i64> [[V1]]
+;
+ %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+ %i0 = load i32, i32* %p0, align 1
+ %i1 = load i32, i32* %p1, align 1
+ %x0 = zext i32 %i0 to i64
+ %x1 = zext i32 %i1 to i64
+ %v0 = insertelement <2 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <2 x i64> %v0, i64 %x1, i32 1
+ ret <2 x i64> %v1
+}
+
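+; i32 -> i64: SSE2 is fully scalar, SLM and AVX2/AVX512 vectorize all four
+; lanes, and AVX1 vectorizes only the low two lanes.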
+define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
+; SSE2-LABEL: @loadext_4i32_to_4i64(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE2-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
+; SSE2-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE2-NEXT: ret <4 x i64> [[V3]]
+;
+; SLM-LABEL: @loadext_4i32_to_4i64(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; SLM-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX1-LABEL: @loadext_4i32_to_4i64(
+; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
+; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
+; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
+; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
+; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i32_to_4i64(
+; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i32_to_4i64(
+; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
+; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT: ret <4 x i64> [[V3]]
+;
+ %p1 = getelementptr inbounds i32, i32* %p0, i64 1
+ %p2 = getelementptr inbounds i32, i32* %p0, i64 2
+ %p3 = getelementptr inbounds i32, i32* %p0, i64 3
+ %i0 = load i32, i32* %p0, align 1
+ %i1 = load i32, i32* %p1, align 1
+ %i2 = load i32, i32* %p2, align 1
+ %i3 = load i32, i32* %p3, align 1
+ %x0 = zext i32 %i0 to i64
+ %x1 = zext i32 %i1 to i64
+ %x2 = zext i32 %i2 to i64
+ %x3 = zext i32 %i3 to i64
+ %v0 = insertelement <4 x i64> undef, i64 %x0, i32 0
+ %v1 = insertelement <4 x i64> %v0, i64 %x1, i32 1
+ %v2 = insertelement <4 x i64> %v1, i64 %x2, i32 2
+ %v3 = insertelement <4 x i64> %v2, i64 %x3, i32 3
+ ret <4 x i64> %v3
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/XCore/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/XCore/lit.local.cfg
new file mode 100644
index 00000000000..bb48713fe33
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/XCore/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'XCore' not in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll b/llvm/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll
new file mode 100644
index 00000000000..d84a4993ec3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/XCore/no-vector-registers.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=xcore | FileCheck %s
+
+target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
+target triple = "xcore"
+
+; Simple 3-pair chain with loads and stores; XCore reports no vector registers, so the chain is expected to stay scalar.
+define void @test1(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I0:%.*]] = load double, double* [[A:%.*]], align 8
+; CHECK-NEXT: [[I1:%.*]] = load double, double* [[B:%.*]], align 8
+; CHECK-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
+; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
+; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8
+; CHECK-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
+; CHECK-NEXT: store double [[MUL]], double* [[C:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C]], i64 1
+; CHECK-NEXT: store double [[MUL5]], double* [[ARRAYIDX5]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %i0 = load double, double* %a, align 8
+ %i1 = load double, double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
+ %i3 = load double, double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
+ %i4 = load double, double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ ret void
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll b/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll
new file mode 100644
index 00000000000..aab3f028f8d
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -slp-vectorizer -slp-max-reg-size=128 -slp-min-reg-size=128 | FileCheck %s
+
+declare void @llvm.sideeffect()
+
+; SLP vectorization across a @llvm.sideeffect: the intrinsic calls are preserved while the surrounding scalar loads and stores are still combined into <4 x float> operations.
+
+define void @test(float* %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr float, float* [[P:%.*]], i64 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr float, float* [[P]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr float, float* [[P]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr float, float* [[P]], i64 3
+; CHECK-NEXT: call void @llvm.sideeffect()
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P0]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: call void @llvm.sideeffect()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[P0]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr float, float* %p, i64 0
+ %p1 = getelementptr float, float* %p, i64 1
+ %p2 = getelementptr float, float* %p, i64 2
+ %p3 = getelementptr float, float* %p, i64 3
+ %l0 = load float, float* %p0
+ %l1 = load float, float* %p1
+ %l2 = load float, float* %p2
+ call void @llvm.sideeffect()
+ %l3 = load float, float* %p3
+ store float %l0, float* %p0
+ call void @llvm.sideeffect()
+ store float %l1, float* %p1
+ store float %l2, float* %p2
+ store float %l3, float* %p3
+ ret void
+}