summaryrefslogtreecommitdiffstats
path: root/llvm/test/Transforms/SLPVectorizer
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/Transforms/SLPVectorizer')
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll98
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll41
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/hadd.ll57
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/hsub.ll57
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/sext.ll938
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/zext.ll954
6 files changed, 1302 insertions, 843 deletions
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
index 8f8b1d443da..9ee016e4331 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll
@@ -35,30 +35,9 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) {
; SSE-NEXT: ret <8 x float> [[R7]]
;
; SLM-LABEL: @sitofp_uitofp(
-; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0
-; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1
-; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3
-; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4
-; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
-; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
-; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SLM-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SLM-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SLM-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float
-; SLM-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float
-; SLM-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float
-; SLM-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float
-; SLM-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float
-; SLM-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float
-; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
-; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
-; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
-; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
-; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7
+; SLM-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[A:%.*]] to <8 x float>
+; SLM-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[A]] to <8 x float>
+; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; SLM-NEXT: ret <8 x float> [[R7]]
;
; AVX-LABEL: @sitofp_uitofp(
@@ -268,11 +247,50 @@ define <8 x float> @fneg_fabs(<8 x float> %a) {
}
define <8 x i32> @sext_zext(<8 x i16> %a) {
-; CHECK-LABEL: @sext_zext(
-; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
-; CHECK-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: ret <8 x i32> [[R7]]
+; SSE-LABEL: @sext_zext(
+; SSE-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: ret <8 x i32> [[R7]]
+;
+; SLM-LABEL: @sext_zext(
+; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i16> [[A:%.*]], i32 0
+; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i16> [[A]], i32 1
+; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A]], i32 2
+; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i32 3
+; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i32 4
+; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i32 5
+; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i32 6
+; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i32 7
+; SLM-NEXT: [[AB0:%.*]] = sext i16 [[A0]] to i32
+; SLM-NEXT: [[AB1:%.*]] = sext i16 [[A1]] to i32
+; SLM-NEXT: [[AB2:%.*]] = sext i16 [[A2]] to i32
+; SLM-NEXT: [[AB3:%.*]] = sext i16 [[A3]] to i32
+; SLM-NEXT: [[AB4:%.*]] = zext i16 [[A4]] to i32
+; SLM-NEXT: [[AB5:%.*]] = zext i16 [[A5]] to i32
+; SLM-NEXT: [[AB6:%.*]] = zext i16 [[A6]] to i32
+; SLM-NEXT: [[AB7:%.*]] = zext i16 [[A7]] to i32
+; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0
+; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1
+; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2
+; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3
+; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4
+; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
+; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6
+; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
+; SLM-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX-LABEL: @sext_zext(
+; AVX-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX-NEXT: ret <8 x i32> [[R7]]
+;
+; AVX512-LABEL: @sext_zext(
+; AVX512-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[A:%.*]] to <8 x i32>
+; AVX512-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[A]] to <8 x i32>
+; AVX512-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <8 x i32> [[R7]]
;
%a0 = extractelement <8 x i16> %a, i32 0
%a1 = extractelement <8 x i16> %a, i32 1
@@ -383,26 +401,24 @@ define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16
; SSE-NEXT: ret <8 x float> [[R7]]
;
; SLM-LABEL: @sitofp_uitofp_4i32_8i16_16i8(
-; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
-; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0
; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1
; SLM-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0
; SLM-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1
-; SLM-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float
-; SLM-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float
-; SLM-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float
-; SLM-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float
+; SLM-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[A:%.*]] to <4 x float>
+; SLM-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[A]] to <4 x float>
; SLM-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float
; SLM-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float
; SLM-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float
; SLM-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float
-; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0
-; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1
-; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2
-; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP3]], i32 0
+; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP4]], i32 1
+; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP5]], i32 2
+; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP6]], i32 3
; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4
; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5
; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 44729b4a8d5..23d1634fdb6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -75,42 +75,11 @@ define <4 x i32> @add_and_v4i32(<4 x i32> %a, <4 x i32> %b) {
}
define <4 x i32> @add_mul_v4i32(<4 x i32> %a, <4 x i32> %b) {
-; SSE-LABEL: @add_mul_v4i32(
-; SSE-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
-; SSE-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
-; SSE-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; SSE-NEXT: ret <4 x i32> [[R3]]
-;
-; SLM-LABEL: @add_mul_v4i32(
-; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0
-; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2
-; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3
-; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0
-; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1
-; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2
-; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3
-; SLM-NEXT: [[AB0:%.*]] = mul i32 [[A0]], [[B0]]
-; SLM-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B1]]
-; SLM-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B2]]
-; SLM-NEXT: [[AB3:%.*]] = mul i32 [[A3]], [[B3]]
-; SLM-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0
-; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1
-; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2
-; SLM-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3
-; SLM-NEXT: ret <4 x i32> [[R3]]
-;
-; AVX-LABEL: @add_mul_v4i32(
-; AVX-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
-; AVX-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX-NEXT: ret <4 x i32> [[R3]]
-;
-; AVX512-LABEL: @add_mul_v4i32(
-; AVX512-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
-; AVX512-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
-; AVX512-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; AVX512-NEXT: ret <4 x i32> [[R3]]
+; CHECK-LABEL: @add_mul_v4i32(
+; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[R3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT: ret <4 x i32> [[R3]]
;
%a0 = extractelement <4 x i32> %a, i32 0
%a1 = extractelement <4 x i32> %a, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index b02244f9614..71f72a93075 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -78,34 +78,11 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
}
define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; SSE-LABEL: @test_v2i64(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT: ret <2 x i64> [[TMP3]]
-;
-; SLM-LABEL: @test_v2i64(
-; SLM-NEXT: [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0
-; SLM-NEXT: [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1
-; SLM-NEXT: [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
-; SLM-NEXT: [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1
-; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[A1]]
-; SLM-NEXT: [[R1:%.*]] = add i64 [[B0]], [[B1]]
-; SLM-NEXT: [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0
-; SLM-NEXT: [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1
-; SLM-NEXT: ret <2 x i64> [[R01]]
-;
-; AVX-LABEL: @test_v2i64(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT: ret <2 x i64> [[TMP3]]
-;
-; AVX512-LABEL: @test_v2i64(
-; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX512-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT: ret <2 x i64> [[TMP3]]
+; CHECK-LABEL: @test_v2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
;
%a0 = extractelement <2 x i64> %a, i32 0
%a1 = extractelement <2 x i64> %a, i32 1
@@ -322,14 +299,10 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE-NEXT: ret <4 x i64> [[R03]]
;
; SLM-LABEL: @test_v4i64(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: ret <4 x i64> [[R03]]
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; SLM-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]]
+; SLM-NEXT: ret <4 x i64> [[TMP3]]
;
; AVX-LABEL: @test_v4i64(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -374,14 +347,10 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: ret <8 x i32> [[R07]]
;
; SLM-LABEL: @test_v8i32(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SLM-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[R07]]
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT: ret <8 x i32> [[TMP3]]
;
; AVX-LABEL: @test_v8i32(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index d6e44aa1d6a..b7e487eed9e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -78,34 +78,11 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b) {
}
define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) {
-; SSE-LABEL: @test_v2i64(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; SSE-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; SSE-NEXT: ret <2 x i64> [[TMP3]]
-;
-; SLM-LABEL: @test_v2i64(
-; SLM-NEXT: [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0
-; SLM-NEXT: [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1
-; SLM-NEXT: [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0
-; SLM-NEXT: [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1
-; SLM-NEXT: [[R0:%.*]] = sub i64 [[A0]], [[A1]]
-; SLM-NEXT: [[R1:%.*]] = sub i64 [[B0]], [[B1]]
-; SLM-NEXT: [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0
-; SLM-NEXT: [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1
-; SLM-NEXT: ret <2 x i64> [[R01]]
-;
-; AVX-LABEL: @test_v2i64(
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; AVX-NEXT: ret <2 x i64> [[TMP3]]
-;
-; AVX512-LABEL: @test_v2i64(
-; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
-; AVX512-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; AVX512-NEXT: ret <2 x i64> [[TMP3]]
+; CHECK-LABEL: @test_v2i64(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
;
%a0 = extractelement <2 x i64> %a, i32 0
%a1 = extractelement <2 x i64> %a, i32 1
@@ -322,14 +299,10 @@ define <4 x i64> @test_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE-NEXT: ret <4 x i64> [[R03]]
;
; SLM-LABEL: @test_v4i64(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 2, i32 6>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> <i32 3, i32 7>
-; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SLM-NEXT: ret <4 x i64> [[R03]]
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; SLM-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]]
+; SLM-NEXT: ret <4 x i64> [[TMP3]]
;
; AVX-LABEL: @test_v4i64(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -374,14 +347,10 @@ define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: ret <8 x i32> [[R07]]
;
; SLM-LABEL: @test_v8i32(
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
-; SLM-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
-; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
-; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]]
-; SLM-NEXT: [[R07:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT: ret <8 x i32> [[R07]]
+; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; SLM-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; SLM-NEXT: ret <8 x i32> [[TMP3]]
;
; AVX-LABEL: @test_v8i32(
; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
index f3404831e21..c3eba4701e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
@@ -11,26 +11,15 @@
;
define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
-; SSE2-LABEL: @loadext_2i8_to_2i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i8_to_2i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i8_to_2i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: ret <2 x i64> [[V1]]
;
; AVX-LABEL: @loadext_2i8_to_2i64(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -54,40 +43,23 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
}
define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
-; SSE2-LABEL: @loadext_4i8_to_4i32(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32
-; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32
-; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32
-; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32
-; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
-; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
-; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
-; SSE2-NEXT: ret <4 x i32> [[V3]]
-;
-; SLM-LABEL: @loadext_4i8_to_4i32(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SLM-NEXT: ret <4 x i32> [[V3]]
+; SSE-LABEL: @loadext_4i8_to_4i32(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32
+; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32
+; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32
+; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32
+; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SSE-NEXT: ret <4 x i32> [[V3]]
;
; AVX-LABEL: @loadext_4i8_to_4i32(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -125,40 +97,23 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
}
define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
-; SSE2-LABEL: @loadext_4i8_to_4i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE2-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64
-; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64
-; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT: ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i8_to_4i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT: ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i8_to_4i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64
+; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64
+; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT: ret <4 x i64> [[V3]]
;
; AVX1-LABEL: @loadext_4i8_to_4i64(
; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -232,34 +187,97 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
}
define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
-; CHECK-LABEL: @loadext_8i8_to_8i16(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; CHECK-NEXT: ret <8 x i16> [[V7]]
+; SSE2-LABEL: @loadext_8i8_to_8i16(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; SSE2-NEXT: ret <8 x i16> [[V7]]
+;
+; SLM-LABEL: @loadext_8i8_to_8i16(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16
+; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16
+; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16
+; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16
+; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16
+; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16
+; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16
+; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16
+; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
+; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
+; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
+; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
+; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
+; SLM-NEXT: ret <8 x i16> [[V7]]
+;
+; AVX-LABEL: @loadext_8i8_to_8i16(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i16>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; AVX-NEXT: ret <8 x i16> [[V7]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -296,34 +314,97 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
}
define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
-; CHECK-LABEL: @loadext_8i8_to_8i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; CHECK-NEXT: ret <8 x i32> [[V7]]
+; SSE2-LABEL: @loadext_8i8_to_8i32(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE2-NEXT: ret <8 x i32> [[V7]]
+;
+; SLM-LABEL: @loadext_8i8_to_8i32(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i32
+; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i32
+; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i32
+; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i32
+; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32
+; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32
+; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32
+; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32
+; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; SLM-NEXT: ret <8 x i32> [[V7]]
+;
+; AVX-LABEL: @loadext_8i8_to_8i32(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX-NEXT: ret <8 x i32> [[V7]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -360,58 +441,177 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
}
define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
-; CHECK-LABEL: @loadext_16i8_to_16i16(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
-; CHECK-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
-; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
-; CHECK-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
-; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
-; CHECK-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
-; CHECK-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
-; CHECK-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; CHECK-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; CHECK-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; CHECK-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; CHECK-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; CHECK-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; CHECK-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; CHECK-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; CHECK-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; CHECK-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; CHECK-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; CHECK-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; CHECK-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; CHECK-NEXT: ret <16 x i16> [[V15]]
+; SSE2-LABEL: @loadext_16i8_to_16i16(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SSE2-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SSE2-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SSE2-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SSE2-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SSE2-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SSE2-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SSE2-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; SSE2-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; SSE2-NEXT: ret <16 x i16> [[V15]]
+;
+; SLM-LABEL: @loadext_16i8_to_16i16(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SLM-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SLM-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SLM-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SLM-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1
+; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1
+; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1
+; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1
+; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1
+; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1
+; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1
+; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1
+; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16
+; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16
+; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16
+; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16
+; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16
+; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16
+; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16
+; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16
+; SLM-NEXT: [[X8:%.*]] = sext i8 [[I8]] to i16
+; SLM-NEXT: [[X9:%.*]] = sext i8 [[I9]] to i16
+; SLM-NEXT: [[X10:%.*]] = sext i8 [[I10]] to i16
+; SLM-NEXT: [[X11:%.*]] = sext i8 [[I11]] to i16
+; SLM-NEXT: [[X12:%.*]] = sext i8 [[I12]] to i16
+; SLM-NEXT: [[X13:%.*]] = sext i8 [[I13]] to i16
+; SLM-NEXT: [[X14:%.*]] = sext i8 [[I14]] to i16
+; SLM-NEXT: [[X15:%.*]] = sext i8 [[I15]] to i16
+; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
+; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
+; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
+; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
+; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
+; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
+; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
+; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
+; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
+; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
+; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
+; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
+; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
+; SLM-NEXT: ret <16 x i16> [[V15]]
+;
+; AVX-LABEL: @loadext_16i8_to_16i16(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; AVX-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; AVX-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; AVX-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; AVX-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; AVX-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; AVX-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; AVX-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i16>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; AVX-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; AVX-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; AVX-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; AVX-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; AVX-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; AVX-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; AVX-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; AVX-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; AVX-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; AVX-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; AVX-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; AVX-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; AVX-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; AVX-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; AVX-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; AVX-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; AVX-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; AVX-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; AVX-NEXT: ret <16 x i16> [[V15]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -484,26 +684,15 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
;
define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
-; SSE2-LABEL: @loadext_2i16_to_2i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i16_to_2i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i16_to_2i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: ret <2 x i64> [[V1]]
;
; AVX-LABEL: @loadext_2i16_to_2i64(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -527,22 +716,57 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
}
define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
-; CHECK-LABEL: @loadext_4i16_to_4i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT: ret <4 x i32> [[V3]]
+; SSE2-LABEL: @loadext_4i16_to_4i32(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT: ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i32(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32
+; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32
+; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32
+; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT: ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i32(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT: ret <4 x i32> [[V3]]
;
%p1 = getelementptr inbounds i16, i16* %p0, i64 1
%p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -563,40 +787,23 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
}
define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
-; SSE2-LABEL: @loadext_4i16_to_4i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SSE2-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64
-; SSE2-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64
-; SSE2-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT: ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i16_to_4i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT: ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i16_to_4i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64
+; SSE-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64
+; SSE-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT: ret <4 x i64> [[V3]]
;
; AVX1-LABEL: @loadext_4i16_to_4i64(
; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -670,34 +877,97 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
}
define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
-; CHECK-LABEL: @loadext_8i16_to_8i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
-; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
-; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; CHECK-NEXT: ret <8 x i32> [[V7]]
+; SSE2-LABEL: @loadext_8i16_to_8i32(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE2-NEXT: ret <8 x i32> [[V7]]
+;
+; SLM-LABEL: @loadext_8i16_to_8i32(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1
+; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1
+; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1
+; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1
+; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i32
+; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i32
+; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i32
+; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i32
+; SLM-NEXT: [[X4:%.*]] = sext i16 [[I4]] to i32
+; SLM-NEXT: [[X5:%.*]] = sext i16 [[I5]] to i32
+; SLM-NEXT: [[X6:%.*]] = sext i16 [[I6]] to i32
+; SLM-NEXT: [[X7:%.*]] = sext i16 [[I7]] to i32
+; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; SLM-NEXT: ret <8 x i32> [[V7]]
+;
+; AVX-LABEL: @loadext_8i16_to_8i32(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX-NEXT: ret <8 x i32> [[V7]]
;
%p1 = getelementptr inbounds i16, i16* %p0, i64 1
%p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -738,26 +1008,15 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
;
define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
-; SSE2-LABEL: @loadext_2i32_to_2i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i32_to_2i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
-; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i32_to_2i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: ret <2 x i64> [[V1]]
;
; AVX-LABEL: @loadext_2i32_to_2i64(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -781,40 +1040,23 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
}
define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
-; SSE2-LABEL: @loadext_4i32_to_4i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SSE2-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64
-; SSE2-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64
-; SSE2-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT: ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i32_to_4i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT: ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i32_to_4i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64
+; SSE-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64
+; SSE-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT: ret <4 x i64> [[V3]]
;
; AVX1-LABEL: @loadext_4i32_to_4i64(
; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
index d82aeb85676..ead4ffdeb0f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
@@ -11,26 +11,15 @@
;
define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
-; SSE2-LABEL: @loadext_2i8_to_2i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i8_to_2i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i8_to_2i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: ret <2 x i64> [[V1]]
;
; AVX-LABEL: @loadext_2i8_to_2i64(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -54,22 +43,57 @@ define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) {
}
define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
-; CHECK-LABEL: @loadext_4i8_to_4i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT: ret <4 x i32> [[V3]]
+; SSE2-LABEL: @loadext_4i8_to_4i32(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT: ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i8_to_4i32(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32
+; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32
+; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32
+; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT: ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i8_to_4i32(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT: ret <4 x i32> [[V3]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -90,40 +114,23 @@ define <4 x i32> @loadext_4i8_to_4i32(i8* %p0) {
}
define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
-; SSE2-LABEL: @loadext_4i8_to_4i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE2-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
-; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; SSE2-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
-; SSE2-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
-; SSE2-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT: ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i8_to_4i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT: ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i8_to_4i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SSE-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i64
+; SSE-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
+; SSE-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT: ret <4 x i64> [[V3]]
;
; AVX1-LABEL: @loadext_4i8_to_4i64(
; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
@@ -197,34 +204,97 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
}
define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
-; CHECK-LABEL: @loadext_8i8_to_8i16(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
-; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
-; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
-; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
-; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
-; CHECK-NEXT: ret <8 x i16> [[V7]]
+; SSE2-LABEL: @loadext_8i8_to_8i16(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; SSE2-NEXT: ret <8 x i16> [[V7]]
+;
+; SLM-LABEL: @loadext_8i8_to_8i16(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16
+; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16
+; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16
+; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16
+; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16
+; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16
+; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16
+; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16
+; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3
+; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4
+; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5
+; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6
+; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7
+; SLM-NEXT: ret <8 x i16> [[V7]]
+;
+; AVX-LABEL: @loadext_8i8_to_8i16(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP7]], i32 3
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4
+; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP8]], i32 4
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5
+; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP9]], i32 5
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6
+; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP10]], i32 6
+; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7
+; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP11]], i32 7
+; AVX-NEXT: ret <8 x i16> [[V7]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -261,34 +331,97 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
}
define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
-; CHECK-LABEL: @loadext_8i8_to_8i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; CHECK-NEXT: ret <8 x i32> [[V7]]
+; SSE2-LABEL: @loadext_8i8_to_8i32(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE2-NEXT: ret <8 x i32> [[V7]]
+;
+; SLM-LABEL: @loadext_8i8_to_8i32(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i32
+; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i32
+; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i32
+; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i32
+; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i32
+; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i32
+; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i32
+; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i32
+; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; SLM-NEXT: ret <8 x i32> [[V7]]
+;
+; AVX-LABEL: @loadext_8i8_to_8i32(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i32>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX-NEXT: ret <8 x i32> [[V7]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -325,58 +458,177 @@ define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
}
define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
-; CHECK-LABEL: @loadext_16i8_to_16i16(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; CHECK-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
-; CHECK-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
-; CHECK-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
-; CHECK-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
-; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
-; CHECK-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
-; CHECK-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
-; CHECK-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
-; CHECK-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
-; CHECK-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
-; CHECK-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
-; CHECK-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
-; CHECK-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
-; CHECK-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
-; CHECK-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
-; CHECK-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
-; CHECK-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
-; CHECK-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
-; CHECK-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
-; CHECK-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
-; CHECK-NEXT: ret <16 x i16> [[V15]]
+; SSE2-LABEL: @loadext_16i8_to_16i16(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SSE2-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SSE2-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SSE2-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SSE2-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SSE2-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SSE2-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SSE2-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SSE2-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; SSE2-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; SSE2-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; SSE2-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; SSE2-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; SSE2-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; SSE2-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; SSE2-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; SSE2-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; SSE2-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; SSE2-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; SSE2-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; SSE2-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; SSE2-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; SSE2-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; SSE2-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; SSE2-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; SSE2-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; SSE2-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; SSE2-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; SSE2-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; SSE2-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; SSE2-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; SSE2-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; SSE2-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; SSE2-NEXT: ret <16 x i16> [[V15]]
+;
+; SLM-LABEL: @loadext_16i8_to_16i16(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; SLM-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; SLM-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; SLM-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; SLM-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; SLM-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1
+; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1
+; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1
+; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1
+; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1
+; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1
+; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1
+; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1
+; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1
+; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1
+; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1
+; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1
+; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16
+; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16
+; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16
+; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16
+; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16
+; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16
+; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16
+; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16
+; SLM-NEXT: [[X8:%.*]] = zext i8 [[I8]] to i16
+; SLM-NEXT: [[X9:%.*]] = zext i8 [[I9]] to i16
+; SLM-NEXT: [[X10:%.*]] = zext i8 [[I10]] to i16
+; SLM-NEXT: [[X11:%.*]] = zext i8 [[I11]] to i16
+; SLM-NEXT: [[X12:%.*]] = zext i8 [[I12]] to i16
+; SLM-NEXT: [[X13:%.*]] = zext i8 [[I13]] to i16
+; SLM-NEXT: [[X14:%.*]] = zext i8 [[I14]] to i16
+; SLM-NEXT: [[X15:%.*]] = zext i8 [[I15]] to i16
+; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3
+; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4
+; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5
+; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6
+; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7
+; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8
+; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9
+; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10
+; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11
+; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12
+; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13
+; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14
+; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15
+; SLM-NEXT: ret <16 x i16> [[V15]]
+;
+; AVX-LABEL: @loadext_16i8_to_16i16(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; AVX-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8
+; AVX-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9
+; AVX-NEXT: [[P10:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10
+; AVX-NEXT: [[P11:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11
+; AVX-NEXT: [[P12:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12
+; AVX-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13
+; AVX-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14
+; AVX-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <16 x i8>*
+; AVX-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i16>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <16 x i16> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <16 x i16> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <16 x i16> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <16 x i16> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP7]], i32 3
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <16 x i16> [[TMP3]], i32 4
+; AVX-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP8]], i32 4
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <16 x i16> [[TMP3]], i32 5
+; AVX-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP9]], i32 5
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <16 x i16> [[TMP3]], i32 6
+; AVX-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP10]], i32 6
+; AVX-NEXT: [[TMP11:%.*]] = extractelement <16 x i16> [[TMP3]], i32 7
+; AVX-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP11]], i32 7
+; AVX-NEXT: [[TMP12:%.*]] = extractelement <16 x i16> [[TMP3]], i32 8
+; AVX-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP12]], i32 8
+; AVX-NEXT: [[TMP13:%.*]] = extractelement <16 x i16> [[TMP3]], i32 9
+; AVX-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP13]], i32 9
+; AVX-NEXT: [[TMP14:%.*]] = extractelement <16 x i16> [[TMP3]], i32 10
+; AVX-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP14]], i32 10
+; AVX-NEXT: [[TMP15:%.*]] = extractelement <16 x i16> [[TMP3]], i32 11
+; AVX-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP15]], i32 11
+; AVX-NEXT: [[TMP16:%.*]] = extractelement <16 x i16> [[TMP3]], i32 12
+; AVX-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP16]], i32 12
+; AVX-NEXT: [[TMP17:%.*]] = extractelement <16 x i16> [[TMP3]], i32 13
+; AVX-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP17]], i32 13
+; AVX-NEXT: [[TMP18:%.*]] = extractelement <16 x i16> [[TMP3]], i32 14
+; AVX-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP18]], i32 14
+; AVX-NEXT: [[TMP19:%.*]] = extractelement <16 x i16> [[TMP3]], i32 15
+; AVX-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP19]], i32 15
+; AVX-NEXT: ret <16 x i16> [[V15]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -449,26 +701,15 @@ define <16 x i16> @loadext_16i8_to_16i16(i8* %p0) {
;
define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
-; SSE2-LABEL: @loadext_2i16_to_2i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i16_to_2i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; SLM-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i16_to_2i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: ret <2 x i64> [[V1]]
;
; AVX-LABEL: @loadext_2i16_to_2i64(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -492,22 +733,57 @@ define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) {
}
define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
-; CHECK-LABEL: @loadext_4i16_to_4i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT: ret <4 x i32> [[V3]]
+; SSE2-LABEL: @loadext_4i16_to_4i32(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT: ret <4 x i32> [[V3]]
+;
+; SLM-LABEL: @loadext_4i16_to_4i32(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32
+; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32
+; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32
+; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32
+; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT: ret <4 x i32> [[V3]]
+;
+; AVX-LABEL: @loadext_4i16_to_4i32(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT: ret <4 x i32> [[V3]]
;
%p1 = getelementptr inbounds i16, i16* %p0, i64 1
%p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -528,40 +804,23 @@ define <4 x i32> @loadext_4i16_to_4i32(i16* %p0) {
}
define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
-; SSE2-LABEL: @loadext_4i16_to_4i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SSE2-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
-; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; SSE2-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
-; SSE2-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
-; SSE2-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT: ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i16_to_4i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT: ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i16_to_4i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SSE-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SSE-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SSE-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i64
+; SSE-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
+; SSE-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT: ret <4 x i64> [[V3]]
;
; AVX1-LABEL: @loadext_4i16_to_4i64(
; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
@@ -635,34 +894,97 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
}
define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
-; CHECK-LABEL: @loadext_8i16_to_8i32(
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
-; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
-; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
-; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; CHECK-NEXT: ret <8 x i32> [[V7]]
+; SSE2-LABEL: @loadext_8i16_to_8i32(
+; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SSE2-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SSE2-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SSE2-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SSE2-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; SSE2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; SSE2-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; SSE2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; SSE2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; SSE2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; SSE2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; SSE2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; SSE2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; SSE2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; SSE2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; SSE2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; SSE2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; SSE2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; SSE2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; SSE2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; SSE2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; SSE2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; SSE2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; SSE2-NEXT: ret <8 x i32> [[V7]]
+;
+; SLM-LABEL: @loadext_8i16_to_8i32(
+; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; SLM-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1
+; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1
+; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; SLM-NEXT: [[I4:%.*]] = load i16, i16* [[P4]], align 1
+; SLM-NEXT: [[I5:%.*]] = load i16, i16* [[P5]], align 1
+; SLM-NEXT: [[I6:%.*]] = load i16, i16* [[P6]], align 1
+; SLM-NEXT: [[I7:%.*]] = load i16, i16* [[P7]], align 1
+; SLM-NEXT: [[X0:%.*]] = zext i16 [[I0]] to i32
+; SLM-NEXT: [[X1:%.*]] = zext i16 [[I1]] to i32
+; SLM-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i32
+; SLM-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i32
+; SLM-NEXT: [[X4:%.*]] = zext i16 [[I4]] to i32
+; SLM-NEXT: [[X5:%.*]] = zext i16 [[I5]] to i32
+; SLM-NEXT: [[X6:%.*]] = zext i16 [[I6]] to i32
+; SLM-NEXT: [[X7:%.*]] = zext i16 [[I7]] to i32
+; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[X0]], i32 0
+; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[X1]], i32 1
+; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[X2]], i32 2
+; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[X3]], i32 3
+; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
+; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
+; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
+; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
+; SLM-NEXT: ret <8 x i32> [[V7]]
+;
+; AVX-LABEL: @loadext_8i16_to_8i32(
+; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4
+; AVX-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5
+; AVX-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6
+; AVX-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <8 x i16>*
+; AVX-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32>
+; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; AVX-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; AVX-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; AVX-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; AVX-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; AVX-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; AVX-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; AVX-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; AVX-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; AVX-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; AVX-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; AVX-NEXT: ret <8 x i32> [[V7]]
;
%p1 = getelementptr inbounds i16, i16* %p0, i64 1
%p2 = getelementptr inbounds i16, i16* %p0, i64 2
@@ -703,26 +1025,15 @@ define <8 x i32> @loadext_8i16_to_8i32(i16* %p0) {
;
define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
-; SSE2-LABEL: @loadext_2i32_to_2i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: ret <2 x i64> [[V1]]
-;
-; SLM-LABEL: @loadext_2i32_to_2i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>*
-; SLM-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: ret <2 x i64> [[V1]]
+; SSE-LABEL: @loadext_2i32_to_2i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: ret <2 x i64> [[V1]]
;
; AVX-LABEL: @loadext_2i32_to_2i64(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
@@ -746,40 +1057,23 @@ define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) {
}
define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) {
-; SSE2-LABEL: @loadext_4i32_to_4i64(
-; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SSE2-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
-; SSE2-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
-; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
-; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
-; SSE2-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
-; SSE2-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
-; SSE2-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
-; SSE2-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
-; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
-; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
-; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; SSE2-NEXT: ret <4 x i64> [[V3]]
-;
-; SLM-LABEL: @loadext_4i32_to_4i64(
-; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
-; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
-; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <4 x i32>*
-; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1
-; SLM-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
-; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
-; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; SLM-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
-; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
-; SLM-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
-; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
-; SLM-NEXT: ret <4 x i64> [[V3]]
+; SSE-LABEL: @loadext_4i32_to_4i64(
+; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
+; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2
+; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3
+; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1
+; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1
+; SSE-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1
+; SSE-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1
+; SSE-NEXT: [[X0:%.*]] = zext i32 [[I0]] to i64
+; SSE-NEXT: [[X1:%.*]] = zext i32 [[I1]] to i64
+; SSE-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64
+; SSE-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64
+; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0
+; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1
+; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; SSE-NEXT: ret <4 x i64> [[V3]]
;
; AVX1-LABEL: @loadext_4i32_to_4i64(
; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1
OpenPOWER on IntegriCloud