summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCraig Topper <craig.topper@intel.com>2019-08-14 14:52:39 +0000
committerCraig Topper <craig.topper@intel.com>2019-08-14 14:52:39 +0000
commit30d3e9c3952096e80c3c82cf61caa545fc678170 (patch)
tree140acd19654ce67f99d1a80be403b6cd113a8c10
parent8c545168ee4d0b446565c655230c7be2e3d56fe9 (diff)
downloadbcm5719-llvm-30d3e9c3952096e80c3c82cf61caa545fc678170.tar.gz
bcm5719-llvm-30d3e9c3952096e80c3c82cf61caa545fc678170.zip
[X86][CostModel] Adjust the costs of ZERO_EXTEND/SIGN_EXTEND with less than 128-bit inputs
Now that we legalize by widening, the element types here won't change. Previously these were modeled as the elements being widened and then the instruction might become an AND or SHL/ASHR pair. But now they'll become something like a ZERO_EXTEND_VECTOR_INREG/SIGN_EXTEND_VECTOR_INREG. For AVX2, when the destination type is legal its clear the cost should be 1 since we have extend instructions that can produce 256 bit vectors from less than 128 bit vectors. I'm a little less sure about AVX1 costs, but I think the ones I changed were definitely too high, but they might still be too high. Differential Revision: https://reviews.llvm.org/D66169 llvm-svn: 368858
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp22
-rw-r--r--llvm/test/Analysis/CostModel/X86/cast.ll30
-rw-r--r--llvm/test/Analysis/CostModel/X86/extend.ll68
-rw-r--r--llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll18
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/cast.ll43
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/sext.ll287
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/zext.ll140
7 files changed, 332 insertions, 276 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 0077ff70c56..92f0b907d50 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1314,8 +1314,10 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
@@ -1371,14 +1373,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
@@ -1402,13 +1404,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll
index 74402f53cba..4a52a01b652 100644
--- a/llvm/test/Analysis/CostModel/X86/cast.ll
+++ b/llvm/test/Analysis/CostModel/X86/cast.ll
@@ -139,11 +139,11 @@ define i32 @zext_sext(<8 x i1> %in) {
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %B = zext <8 x i16> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = sext <4 x i32> undef to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <4 x i32> undef to <4 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
@@ -168,12 +168,12 @@ define i32 @zext_sext(<8 x i1> %in) {
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i16> undef to <8 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <4 x i32> undef to <4 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <4 x i32> undef to <4 x i64>
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
@@ -198,12 +198,12 @@ define i32 @zext_sext(<8 x i1> %in) {
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i16> undef to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = zext <8 x i16> undef to <8 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <4 x i32> undef to <4 x i64>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v8i8.z = zext <8 x i8> undef to <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v8i8.s = sext <8 x i8> undef to <8 x i32>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i16.z = zext <4 x i16> undef to <4 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i16.s = sext <4 x i16> undef to <4 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i8.z = zext <4 x i8> undef to <4 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C.v4i8.s = sext <4 x i8> undef to <4 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = zext <4 x i32> undef to <4 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D1 = zext <8 x i32> undef to <8 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D2 = sext <8 x i32> undef to <8 x i64>
diff --git a/llvm/test/Analysis/CostModel/X86/extend.ll b/llvm/test/Analysis/CostModel/X86/extend.ll
index 1de0b0b20e1..6ed4cdfde08 100644
--- a/llvm/test/Analysis/CostModel/X86/extend.ll
+++ b/llvm/test/Analysis/CostModel/X86/extend.ll
@@ -104,8 +104,8 @@ define i32 @zext_vXi16() {
;
; AVX2-LABEL: 'zext_vXi16'
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i16> undef to <8 x i32>
@@ -114,7 +114,7 @@ define i32 @zext_vXi16() {
;
; AVX512-LABEL: 'zext_vXi16'
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i16> undef to <2 x i64>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i16> undef to <4 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i16> undef to <8 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i16> undef to <2 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i16> undef to <4 x i32>
@@ -207,11 +207,11 @@ define i32 @zext_vXi8() {
;
; AVX2-LABEL: 'zext_vXi8'
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
@@ -222,11 +222,11 @@ define i32 @zext_vXi8() {
;
; AVX512F-LABEL: 'zext_vXi8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
@@ -237,11 +237,11 @@ define i32 @zext_vXi8() {
;
; AVX512BW-LABEL: 'zext_vXi8'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = zext <2 x i8> undef to <2 x i64>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = zext <4 x i8> undef to <4 x i64>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = zext <8 x i8> undef to <8 x i64>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = zext <2 x i8> undef to <2 x i32>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = zext <4 x i8> undef to <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = zext <8 x i8> undef to <8 x i32>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = zext <16 x i8> undef to <16 x i32>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = zext <2 x i8> undef to <2 x i16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = zext <4 x i8> undef to <4 x i16>
@@ -518,8 +518,8 @@ define i32 @sext_vXi16() {
;
; AVX1-LABEL: 'sext_vXi16'
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
@@ -528,8 +528,8 @@ define i32 @sext_vXi16() {
;
; AVX2-LABEL: 'sext_vXi16'
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
@@ -538,7 +538,7 @@ define i32 @sext_vXi16() {
;
; AVX512-LABEL: 'sext_vXi16'
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
@@ -548,8 +548,8 @@ define i32 @sext_vXi16() {
;
; BTVER2-LABEL: 'sext_vXi16'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i16> undef to <2 x i64>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i16> undef to <4 x i64>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i16> undef to <8 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i16> undef to <2 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i16> undef to <4 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i16> undef to <8 x i32>
@@ -616,11 +616,11 @@ define i32 @sext_vXi8() {
;
; AVX1-LABEL: 'sext_vXi8'
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
-; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
@@ -631,11 +631,11 @@ define i32 @sext_vXi8() {
;
; AVX2-LABEL: 'sext_vXi8'
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
-; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
@@ -646,11 +646,11 @@ define i32 @sext_vXi8() {
;
; AVX512F-LABEL: 'sext_vXi8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
-; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
@@ -661,11 +661,11 @@ define i32 @sext_vXi8() {
;
; AVX512BW-LABEL: 'sext_vXi8'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
@@ -676,11 +676,11 @@ define i32 @sext_vXi8() {
;
; BTVER2-LABEL: 'sext_vXi8'
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sext <2 x i8> undef to <2 x i64>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sext <4 x i8> undef to <4 x i64>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8i64 = sext <8 x i8> undef to <8 x i64>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i32 = sext <2 x i8> undef to <2 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = sext <4 x i8> undef to <4 x i32>
-; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
+; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sext <8 x i8> undef to <8 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = sext <16 x i8> undef to <16 x i32>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i16 = sext <2 x i8> undef to <2 x i16>
; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i16 = sext <4 x i8> undef to <4 x i16>
diff --git a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
index 028ed87a989..1f354e2cf88 100644
--- a/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
+++ b/llvm/test/Analysis/CostModel/X86/min-legal-vector-width.ll
@@ -5,7 +5,7 @@
define void @zext256() "min-legal-vector-width"="256" {
; VEC256-LABEL: 'zext256'
-; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = zext <8 x i16> undef to <8 x i64>
+; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = zext <8 x i16> undef to <8 x i64>
; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i32> undef to <8 x i64>
; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = zext <16 x i8> undef to <16 x i32>
; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <16 x i16> undef to <16 x i32>
@@ -30,7 +30,7 @@ define void @zext256() "min-legal-vector-width"="256" {
define void @zext512() "min-legal-vector-width"="512" {
; AVX-LABEL: 'zext512'
-; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = zext <8 x i16> undef to <8 x i64>
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = zext <8 x i16> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = zext <8 x i32> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %C = zext <16 x i8> undef to <16 x i32>
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = zext <16 x i16> undef to <16 x i32>
@@ -63,8 +63,8 @@ define void @zext512() "min-legal-vector-width"="512" {
define void @sext256() "min-legal-vector-width"="256" {
; VEC256-LABEL: 'sext256'
-; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = sext <8 x i8> undef to <8 x i64>
-; VEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %B = sext <8 x i16> undef to <8 x i64>
+; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = sext <8 x i16> undef to <8 x i64>
; VEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C = sext <8 x i32> undef to <8 x i64>
; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = sext <16 x i8> undef to <16 x i32>
; VEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <16 x i16> undef to <16 x i32>
@@ -72,7 +72,7 @@ define void @sext256() "min-legal-vector-width"="256" {
; VEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; VEC512-LABEL: 'sext256'
-; VEC512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i8> undef to <8 x i64>
; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64>
; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64>
; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32>
@@ -91,8 +91,8 @@ define void @sext256() "min-legal-vector-width"="256" {
define void @sext512() "min-legal-vector-width"="512" {
; AVX-LABEL: 'sext512'
-; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %A = sext <8 x i8> undef to <8 x i64>
-; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %B = sext <8 x i16> undef to <8 x i64>
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %B = sext <8 x i16> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %C = sext <8 x i32> undef to <8 x i64>
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %D = sext <16 x i8> undef to <16 x i32>
; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %E = sext <16 x i16> undef to <16 x i32>
@@ -100,7 +100,7 @@ define void @sext512() "min-legal-vector-width"="512" {
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; SKX256-LABEL: 'sext512'
-; SKX256-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i8> undef to <8 x i64>
; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64>
; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64>
; SKX256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32>
@@ -109,7 +109,7 @@ define void @sext512() "min-legal-vector-width"="512" {
; SKX256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; VEC512-LABEL: 'sext512'
-; VEC512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %A = sext <8 x i8> undef to <8 x i64>
+; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A = sext <8 x i8> undef to <8 x i64>
; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %B = sext <8 x i16> undef to <8 x i64>
; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %C = sext <8 x i32> undef to <8 x i64>
; VEC512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %D = sext <16 x i8> undef to <16 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/cast.ll
index 2f9f84948ea..5d023008e60 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/cast.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/cast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefixes=CHECK,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -basicaa -slp-vectorizer -dce -S | FileCheck %s --check-prefixes=CHECK,AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@@ -76,21 +76,30 @@ entry:
}
define i64 @test_sext_4i16_to_4i64(i64* noalias nocapture %A, i16* noalias nocapture %B) {
-; CHECK-LABEL: @test_sext_4i16_to_4i64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
-; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
-; CHECK-NEXT: ret i64 undef
+; SSE42-LABEL: @test_sext_4i16_to_4i64(
+; SSE42-NEXT: entry:
+; SSE42-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <2 x i16>*
+; SSE42-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* [[TMP0]], align 1
+; SSE42-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i64>
+; SSE42-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
+; SSE42-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[TMP3]], align 4
+; SSE42-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2
+; SSE42-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2
+; SSE42-NEXT: [[TMP4:%.*]] = bitcast i16* [[ARRAYIDX5]] to <2 x i16>*
+; SSE42-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* [[TMP4]], align 1
+; SSE42-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP5]] to <2 x i64>
+; SSE42-NEXT: [[TMP7:%.*]] = bitcast i64* [[ARRAYIDX7]] to <2 x i64>*
+; SSE42-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 4
+; SSE42-NEXT: ret i64 undef
+;
+; AVX-LABEL: @test_sext_4i16_to_4i64(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[TMP0:%.*]] = bitcast i16* [[B:%.*]] to <4 x i16>*
+; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 1
+; AVX-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i64>
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to <4 x i64>*
+; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 4
+; AVX-NEXT: ret i64 undef
;
entry:
%0 = load i16, i16* %B, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
index 75406c927d9..f3404831e21 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll
@@ -160,24 +160,58 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
; SLM-NEXT: ret <4 x i64> [[V3]]
;
-; AVX-LABEL: @loadext_4i8_to_4i64(
-; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64
-; AVX-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64
-; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX-NEXT: ret <4 x i64> [[V3]]
+; AVX1-LABEL: @loadext_4i8_to_4i64(
+; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX1-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64
+; AVX1-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64
+; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i8_to_4i64(
+; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i8_to_4i64(
+; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i64>
+; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT: ret <4 x i64> [[V3]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -262,125 +296,34 @@ define <8 x i16> @loadext_8i8_to_8i16(i8* %p0) {
}
define <8 x i32> @loadext_8i8_to_8i32(i8* %p0) {
-; SSE-LABEL: @loadext_8i8_to_8i32(
-; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; SSE-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; SSE-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; SSE-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; SSE-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; SSE-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; SSE-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; SSE-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; SSE-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; SSE-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; SSE-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; SSE-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; SSE-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; SSE-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; SSE-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; SSE-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; SSE-NEXT: ret <8 x i32> [[V7]]
-;
-; AVX1-LABEL: @loadext_8i8_to_8i32(
-; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX1-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; AVX1-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; AVX1-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; AVX1-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
-; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
-; AVX1-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1
-; AVX1-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1
-; AVX1-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1
-; AVX1-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1
-; AVX1-NEXT: [[TMP3:%.*]] = sext <4 x i8> [[TMP2]] to <4 x i32>
-; AVX1-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i32
-; AVX1-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i32
-; AVX1-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i32
-; AVX1-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i32
-; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
-; AVX1-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1
-; AVX1-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
-; AVX1-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX1-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; AVX1-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX1-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[X4]], i32 4
-; AVX1-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[X5]], i32 5
-; AVX1-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[X6]], i32 6
-; AVX1-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[X7]], i32 7
-; AVX1-NEXT: ret <8 x i32> [[V7]]
-;
-; AVX2-LABEL: @loadext_8i8_to_8i32(
-; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX2-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; AVX2-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; AVX2-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; AVX2-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; AVX2-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX2-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX2-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX2-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX2-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX2-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX2-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX2-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX2-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX2-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX2-NEXT: ret <8 x i32> [[V7]]
-;
-; AVX512-LABEL: @loadext_8i8_to_8i32(
-; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX512-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
-; AVX512-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
-; AVX512-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
-; AVX512-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
-; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
-; AVX512-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; AVX512-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
-; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
-; AVX512-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
-; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
-; AVX512-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
-; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
-; AVX512-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
-; AVX512-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
-; AVX512-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
-; AVX512-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
-; AVX512-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
-; AVX512-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
-; AVX512-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
-; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
-; AVX512-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
-; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
-; AVX512-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
-; AVX512-NEXT: ret <8 x i32> [[V7]]
+; CHECK-LABEL: @loadext_8i8_to_8i32(
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4
+; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5
+; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6
+; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <8 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+; CHECK-NEXT: [[V0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: [[V1:%.*]] = insertelement <8 x i32> [[V0]], i32 [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 2
+; CHECK-NEXT: [[V2:%.*]] = insertelement <8 x i32> [[V1]], i32 [[TMP6]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP3]], i32 3
+; CHECK-NEXT: [[V3:%.*]] = insertelement <8 x i32> [[V2]], i32 [[TMP7]], i32 3
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP3]], i32 4
+; CHECK-NEXT: [[V4:%.*]] = insertelement <8 x i32> [[V3]], i32 [[TMP8]], i32 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[TMP3]], i32 5
+; CHECK-NEXT: [[V5:%.*]] = insertelement <8 x i32> [[V4]], i32 [[TMP9]], i32 5
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP3]], i32 6
+; CHECK-NEXT: [[V6:%.*]] = insertelement <8 x i32> [[V5]], i32 [[TMP10]], i32 6
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7
+; CHECK-NEXT: [[V7:%.*]] = insertelement <8 x i32> [[V6]], i32 [[TMP11]], i32 7
+; CHECK-NEXT: ret <8 x i32> [[V7]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -655,24 +598,58 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
; SLM-NEXT: ret <4 x i64> [[V3]]
;
-; AVX-LABEL: @loadext_4i16_to_4i64(
-; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; AVX-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64
-; AVX-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64
-; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX-NEXT: ret <4 x i64> [[V3]]
+; AVX1-LABEL: @loadext_4i16_to_4i64(
+; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX1-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64
+; AVX1-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64
+; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i16_to_4i64(
+; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX2-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i16_to_4i64(
+; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP3:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i64>
+; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT: ret <4 x i64> [[V3]]
;
%p1 = getelementptr inbounds i16, i16* %p0, i64 1
%p2 = getelementptr inbounds i16, i16* %p0, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
index a05e4186134..d82aeb85676 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll
@@ -125,24 +125,58 @@ define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) {
; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
; SLM-NEXT: ret <4 x i64> [[V3]]
;
-; AVX-LABEL: @loadext_4i8_to_4i64(
-; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
-; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
-; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
-; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
-; AVX-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
-; AVX-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
-; AVX-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
-; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
-; AVX-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
-; AVX-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
-; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX-NEXT: ret <4 x i64> [[V3]]
+; AVX1-LABEL: @loadext_4i8_to_4i64(
+; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>*
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1
+; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1
+; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1
+; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64>
+; AVX1-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64
+; AVX1-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64
+; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i8_to_4i64(
+; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i8_to_4i64(
+; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1
+; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2
+; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
+; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT: ret <4 x i64> [[V3]]
;
%p1 = getelementptr inbounds i8, i8* %p0, i64 1
%p2 = getelementptr inbounds i8, i8* %p0, i64 2
@@ -529,24 +563,58 @@ define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) {
; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
; SLM-NEXT: ret <4 x i64> [[V3]]
;
-; AVX-LABEL: @loadext_4i16_to_4i64(
-; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
-; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
-; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
-; AVX-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
-; AVX-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
-; AVX-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
-; AVX-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
-; AVX-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
-; AVX-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
-; AVX-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
-; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
-; AVX-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
-; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
-; AVX-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
-; AVX-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
-; AVX-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
-; AVX-NEXT: ret <4 x i64> [[V3]]
+; AVX1-LABEL: @loadext_4i16_to_4i64(
+; AVX1-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX1-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>*
+; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1
+; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1
+; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1
+; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64>
+; AVX1-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64
+; AVX1-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64
+; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0
+; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1
+; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2
+; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3
+; AVX1-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX2-LABEL: @loadext_4i16_to_4i64(
+; AVX2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX2-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; AVX2-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX2-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX2-NEXT: ret <4 x i64> [[V3]]
+;
+; AVX512-LABEL: @loadext_4i16_to_4i64(
+; AVX512-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1
+; AVX512-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2
+; AVX512-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <4 x i16>*
+; AVX512-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i64>
+; AVX512-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP3]], i32 0
+; AVX512-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0
+; AVX512-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP3]], i32 1
+; AVX512-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1
+; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP3]], i32 2
+; AVX512-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP6]], i32 2
+; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP3]], i32 3
+; AVX512-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP7]], i32 3
+; AVX512-NEXT: ret <4 x i64> [[V3]]
;
%p1 = getelementptr inbounds i16, i16* %p0, i64 1
%p2 = getelementptr inbounds i16, i16* %p0, i64 2
OpenPOWER on IntegriCloud