author     Craig Topper <craig.topper@intel.com>  2018-11-18 18:11:25 +0000
committer  Craig Topper <craig.topper@intel.com>  2018-11-18 18:11:25 +0000
commit     11d50948e26261bc80a1c745422995d1f02aebb5 (patch)
tree       d9829b4a3b66bca40cb98bcbbc12d772b600c3ea
parent     bc8148f7b056f8cd1c6a2dfe59988b3e1ab4215d (diff)
[X86] Disable combineToExtendVectorInReg under -x86-experimental-vector-widening-legalization. Add custom type legalization for extends.
If we widen illegal types instead of promoting, we should be able to rely on the type legalizer to create the vector_inreg operations for us, with some caveats. This patch disables combineToExtendVectorInReg when we are using widening.

I've enabled custom legalization for v8i8->v8i64 extends under avx512f, since the type legalizer would want to create a vector_inreg with a v64i8 input type, which isn't legal without avx512bw. So we go to v16i8 with custom code, using the relaxation of rules we get from D54346.

I've also enabled custom legalization of v8i64 and v16i32 operations with AVX. When the input type is 128 bits, the default splitting legalization would first extend 128->256, then split into two 128-bit pieces, extend each half to 256, and concat the result. The custom legalization I've added instead uses a 128->256 bit vector_inreg extend that only reads the lower 64 bits for the low half of the split, then shuffles the high 64 bits to the low 64 bits and does another vector_inreg extend.

llvm-svn: 347172
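For readers skimming the patch, here is a minimal, standalone C++ sketch of the element-level behaviour the custom split in ReplaceNodeResults aims for, using a v16i8 -> v16i32 sign extend as the example. It is a conceptual model only, not LLVM code; the names sextInReg, In, Lo, and Hi are made up for illustration.

// Conceptual model of the custom split: the result is built from two
// 128->256-bit sign_extend_vector_inreg operations instead of the default
// extend-then-split sequence.
#include <array>
#include <cstdint>
#include <cstdio>

// Models SIGN_EXTEND_VECTOR_INREG: reads only the low 8 lanes (64 bits)
// of the 16-lane input and sign-extends them to 32 bits each.
static std::array<int32_t, 8> sextInReg(const std::array<int8_t, 16> &In) {
  std::array<int32_t, 8> Out{};
  for (int i = 0; i < 8; ++i)
    Out[i] = In[i]; // implicit sign extension i8 -> i32
  return Out;
}

int main() {
  std::array<int8_t, 16> In{};
  for (int i = 0; i < 16; ++i)
    In[i] = static_cast<int8_t>(i - 8);

  // Low half: the extend-in-reg only reads lanes 0..7 of the 128-bit input.
  std::array<int32_t, 8> Lo = sextInReg(In);

  // High half: shuffle lanes 8..15 down to lanes 0..7, then extend-in-reg.
  std::array<int8_t, 16> Shuf{};
  for (int i = 0; i < 8; ++i)
    Shuf[i] = In[i + 8]; // upper lanes are left undef in the real shuffle
  std::array<int32_t, 8> Hi = sextInReg(Shuf);

  // CONCAT_VECTORS(Lo, Hi) yields the final v16i32 result.
  for (int i = 0; i < 8; ++i)
    std::printf("Lo[%d]=%d Hi[%d]=%d\n", i, Lo[i], i, Hi[i]);
}

The shuffle exists so that both halves can be produced by the same 128->256-bit vector_inreg node, which only ever reads the low 64 bits of its input, avoiding the extra 256-bit extend the default splitting would create.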
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp         74
-rw-r--r--  llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll    21
-rw-r--r--  llvm/test/CodeGen/X86/vector-sext-widen.ll      56
-rw-r--r--  llvm/test/CodeGen/X86/vector-zext-widen.ll      97
4 files changed, 154 insertions, 94 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1c8f6aad21c..43be941e1a4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1104,6 +1104,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, VT, Custom);
}
+ if (ExperimentalVectorWideningLegalization) {
+ // These types need custom splitting if their input is a 128-bit vector.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ }
+
setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
@@ -1368,6 +1376,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ if (ExperimentalVectorWideningLegalization) {
+ // Need to custom widen this if we don't have AVX512BW.
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
+ }
+
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
setOperationAction(ISD::FCEIL, VT, Legal);
@@ -17636,6 +17651,17 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
+ // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
+ if (InVT == MVT::v8i8) {
+ if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
+ return SDValue();
+
+ In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
+ MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
+ // FIXME: This should be ANY_EXTEND_VECTOR_INREG for ANY_EXTEND input.
+ return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, dl, VT, In);
+ }
+
if (Subtarget.hasInt256())
return Op;
@@ -20089,6 +20115,16 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
+ // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
+ if (InVT == MVT::v8i8) {
+ if (!ExperimentalVectorWideningLegalization || VT != MVT::v8i64)
+ return SDValue();
+
+ In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
+ MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
+ return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
+ }
+
if (Subtarget.hasInt256())
return Op;
@@ -26307,6 +26343,41 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ if (!ExperimentalVectorWideningLegalization)
+ return;
+
+ EVT VT = N->getValueType(0);
+ assert((VT == MVT::v16i32 || VT == MVT::v8i64) && "Unexpected VT!");
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+ if (InVT.is128BitVector()) {
+ // Perform custom splitting instead of the two stage extend we would get
+ // by default.
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ assert(isTypeLegal(LoVT) && "Split VT not legal?");
+
+ bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
+
+ SDValue Lo = getExtendInVec(IsSigned, dl, LoVT, In, DAG);
+
+ // We need to shift the input over by half the number of elements.
+ unsigned NumElts = InVT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
+ for (unsigned i = 0; i != HalfNumElts; ++i)
+ ShufMask[i] = i + HalfNumElts;
+
+ SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
+ Hi = getExtendInVec(IsSigned, dl, HiVT, Hi, DAG);
+
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ }
+ return;
+ }
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: {
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
@@ -38715,6 +38786,9 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ if (ExperimentalVectorWideningLegalization)
+ return SDValue();
+
unsigned Opcode = N->getOpcode();
if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
return SDValue();
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll b/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll
index b87a15442db..13fe0f6220e 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp-widen.ll
@@ -5435,13 +5435,14 @@ define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq 24(%rdi), %rax
-; SSE2-NEXT: movdqu 8(%rdi), %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: movq 24(%rdi), %rax
; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE2-NEXT: movaps %xmm0, 16(%rax)
; SSE2-NEXT: movaps %xmm1, (%rax)
@@ -5450,14 +5451,12 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
; SSE41: # %bb.0:
; SSE41-NEXT: movq 24(%rdi), %rax
-; SSE41-NEXT: movdqu 8(%rdi), %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
+; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0
+; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, (%rax)
-; SSE41-NEXT: movaps %xmm1, 16(%rax)
+; SSE41-NEXT: movaps %xmm0, 16(%rax)
+; SSE41-NEXT: movaps %xmm1, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
diff --git a/llvm/test/CodeGen/X86/vector-sext-widen.ll b/llvm/test/CodeGen/X86/vector-sext-widen.ll
index d0969c6607e..44d9ee5b2f9 100644
--- a/llvm/test/CodeGen/X86/vector-sext-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-sext-widen.ll
@@ -495,26 +495,24 @@ entry:
define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $24, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $24, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
@@ -525,26 +523,24 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: sext_16i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: psrad $24, %xmm4
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: psrad $24, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
@@ -912,8 +908,8 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: psrad $16, %xmm3
@@ -938,8 +934,8 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: psrad $31, %xmm3
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: psrad $16, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-zext-widen.ll b/llvm/test/CodeGen/X86/vector-zext-widen.ll
index bc8896f0116..2eafc7cdaf0 100644
--- a/llvm/test/CodeGen/X86/vector-zext-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-zext-widen.ll
@@ -123,15 +123,14 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
;
; AVX1-LABEL: zext_32i8_to_32i16:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
@@ -398,16 +397,15 @@ entry:
define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -415,15 +413,13 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
;
; SSSE3-LABEL: zext_16i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i64:
@@ -585,15 +581,14 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
;
; AVX1-LABEL: zext_16i16_to_16i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
@@ -884,15 +879,14 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
;
; AVX1-LABEL: zext_8i32_to_8i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
@@ -1162,16 +1156,15 @@ entry:
define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i64:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -1179,16 +1172,14 @@ define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
;
; SSSE3-LABEL: load_zext_8i8_to_8i64:
; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2,128,128,128,128,128,128,128,3,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
+; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: movdqa %xmm3, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i64:
@@ -2229,11 +2220,11 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; AVX2-LABEL: zext_32i8_to_32i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vmovdqa %ymm4, %ymm0
; AVX2-NEXT: retq